Skip to content

Commit d2e42a1

Browse files
committed
Merge branch 'ndo_set_rx_headroom'
Paolo Abeni says: ==================== bridge/ovs: avoid skb head copy on frame forwarding Currently, when an OVS or Linux bridge is used to forward frames towards some tunnel device, a skb_head_copy() may occur if the ingress device does not provide enough headroom for the tx encapsulation. This patch series tries to address the issue by implementing a new ndo operation to allow the master device to control the headroom used when allocating the skb on frame reception. Said operation is used by the Linux bridge to notify the bridged ports of needed_headroom changes, and similar bookkeeping and behaviour are also added to openvswitch, on a per datapath basis. Finally, the operation is implemented for veth and tun devices, which gives a performance improvement in the 6-12% range when forwarding frames from said devices towards a vxlan tunnel. v2: - fix netdev_get_fwd_headroom() behaviour - remove some code duplication with the netdev_set_rx_headroom() and netdev_reset_rx_headroom() helpers - handle headroom reset on [v]port removal/deletion - initialize tun align to the old default value v3: - fix a comment typo ==================== Acked-by: Pravin B Shelar <pshelar@ovn.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 46d5efa + 163e529 commit d2e42a1

File tree

7 files changed

+161
-4
lines changed

7 files changed

+161
-4
lines changed

drivers/net/tun.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ struct tun_struct {
187187
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
188188
NETIF_F_TSO6|NETIF_F_UFO)
189189

190+
int align;
190191
int vnet_hdr_sz;
191192
int sndbuf;
192193
struct tap_filter txflt;
@@ -934,6 +935,17 @@ static void tun_poll_controller(struct net_device *dev)
934935
return;
935936
}
936937
#endif
938+
939+
static void tun_set_headroom(struct net_device *dev, int new_hr)
940+
{
941+
struct tun_struct *tun = netdev_priv(dev);
942+
943+
if (new_hr < NET_SKB_PAD)
944+
new_hr = NET_SKB_PAD;
945+
946+
tun->align = new_hr;
947+
}
948+
937949
static const struct net_device_ops tun_netdev_ops = {
938950
.ndo_uninit = tun_net_uninit,
939951
.ndo_open = tun_net_open,
@@ -945,6 +957,7 @@ static const struct net_device_ops tun_netdev_ops = {
945957
#ifdef CONFIG_NET_POLL_CONTROLLER
946958
.ndo_poll_controller = tun_poll_controller,
947959
#endif
960+
.ndo_set_rx_headroom = tun_set_headroom,
948961
};
949962

950963
static const struct net_device_ops tap_netdev_ops = {
@@ -962,6 +975,7 @@ static const struct net_device_ops tap_netdev_ops = {
962975
.ndo_poll_controller = tun_poll_controller,
963976
#endif
964977
.ndo_features_check = passthru_features_check,
978+
.ndo_set_rx_headroom = tun_set_headroom,
965979
};
966980

967981
static void tun_flow_init(struct tun_struct *tun)
@@ -1086,7 +1100,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
10861100
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
10871101
struct sk_buff *skb;
10881102
size_t total_len = iov_iter_count(from);
1089-
size_t len = total_len, align = NET_SKB_PAD, linear;
1103+
size_t len = total_len, align = tun->align, linear;
10901104
struct virtio_net_hdr gso = { 0 };
10911105
int good_linear;
10921106
int copylen;
@@ -1694,6 +1708,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
16941708
tun->txflt.count = 0;
16951709
tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
16961710

1711+
tun->align = NET_SKB_PAD;
16971712
tun->filter_attached = false;
16981713
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
16991714

drivers/net/veth.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ struct pcpu_vstats {
3535
struct veth_priv {
3636
struct net_device __rcu *peer;
3737
atomic64_t dropped;
38+
unsigned requested_headroom;
3839
};
3940

4041
/*
@@ -271,6 +272,29 @@ static int veth_get_iflink(const struct net_device *dev)
271272
return iflink;
272273
}
273274

275+
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
276+
{
277+
struct veth_priv *peer_priv, *priv = netdev_priv(dev);
278+
struct net_device *peer;
279+
280+
if (new_hr < 0)
281+
new_hr = 0;
282+
283+
rcu_read_lock();
284+
peer = rcu_dereference(priv->peer);
285+
if (unlikely(!peer))
286+
goto out;
287+
288+
peer_priv = netdev_priv(peer);
289+
priv->requested_headroom = new_hr;
290+
new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
291+
dev->needed_headroom = new_hr;
292+
peer->needed_headroom = new_hr;
293+
294+
out:
295+
rcu_read_unlock();
296+
}
297+
274298
static const struct net_device_ops veth_netdev_ops = {
275299
.ndo_init = veth_dev_init,
276300
.ndo_open = veth_open,
@@ -285,6 +309,7 @@ static const struct net_device_ops veth_netdev_ops = {
285309
#endif
286310
.ndo_get_iflink = veth_get_iflink,
287311
.ndo_features_check = passthru_features_check,
312+
.ndo_set_rx_headroom = veth_set_rx_headroom,
288313
};
289314

290315
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
@@ -301,6 +326,7 @@ static void veth_setup(struct net_device *dev)
301326
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
302327
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
303328
dev->priv_flags |= IFF_NO_QUEUE;
329+
dev->priv_flags |= IFF_PHONY_HEADROOM;
304330

305331
dev->netdev_ops = &veth_netdev_ops;
306332
dev->ethtool_ops = &veth_ethtool_ops;

include/linux/netdevice.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,6 +1093,12 @@ struct tc_to_netdev {
10931093
* This function is used to get egress tunnel information for given skb.
10941094
* This is useful for retrieving outer tunnel header parameters while
10951095
* sampling packet.
1096+
* void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
1097+
* This function is used to specify the headroom that the skb must
1098+
* consider when allocating skb during packet reception. Setting
1099+
* appropriate rx headroom value allows avoiding skb head copy on
1100+
* forward. Setting a negative value resets the rx headroom to the
1101+
* default value.
10961102
*
10971103
*/
10981104
struct net_device_ops {
@@ -1278,6 +1284,8 @@ struct net_device_ops {
12781284
bool proto_down);
12791285
int (*ndo_fill_metadata_dst)(struct net_device *dev,
12801286
struct sk_buff *skb);
1287+
void (*ndo_set_rx_headroom)(struct net_device *dev,
1288+
int needed_headroom);
12811289
};
12821290

12831291
/**
@@ -1315,6 +1323,8 @@ struct net_device_ops {
13151323
* @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
13161324
* @IFF_TEAM: device is a team device
13171325
* @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
1326+
* @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
1327+
* entity (i.e. the master device for bridged veth)
13181328
*/
13191329
enum netdev_priv_flags {
13201330
IFF_802_1Q_VLAN = 1<<0,
@@ -1343,6 +1353,7 @@ enum netdev_priv_flags {
13431353
IFF_L3MDEV_SLAVE = 1<<23,
13441354
IFF_TEAM = 1<<24,
13451355
IFF_RXFH_CONFIGURED = 1<<25,
1356+
IFF_PHONY_HEADROOM = 1<<26,
13461357
};
13471358

13481359
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
19371948
struct sk_buff *skb,
19381949
void *accel_priv);
19391950

1951+
/* returns the headroom that the master device needs to take in account
1952+
* when forwarding to this dev
1953+
*/
1954+
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
1955+
{
1956+
return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
1957+
}
1958+
1959+
static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
1960+
{
1961+
if (dev->netdev_ops->ndo_set_rx_headroom)
1962+
dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
1963+
}
1964+
1965+
/* Set the device rx headroom back to the dev's default by issuing a
 * negative (-1) headroom request.
 */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
	const int reset_request = -1;

	netdev_set_rx_headroom(dev, reset_request);
}
1970+
19401971
/*
19411972
* Net namespace inlines
19421973
*/

net/bridge/br_if.c

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
223223
destroy_nbp(p);
224224
}
225225

226+
static unsigned get_max_headroom(struct net_bridge *br)
227+
{
228+
unsigned max_headroom = 0;
229+
struct net_bridge_port *p;
230+
231+
list_for_each_entry(p, &br->port_list, list) {
232+
unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
233+
234+
if (dev_headroom > max_headroom)
235+
max_headroom = dev_headroom;
236+
}
237+
238+
return max_headroom;
239+
}
240+
241+
static void update_headroom(struct net_bridge *br, int new_hr)
242+
{
243+
struct net_bridge_port *p;
244+
245+
list_for_each_entry(p, &br->port_list, list)
246+
netdev_set_rx_headroom(p->dev, new_hr);
247+
248+
br->dev->needed_headroom = new_hr;
249+
}
250+
226251
/* Delete port(interface) from bridge is done in two steps.
227252
* via RCU. First step, marks device as down. That deletes
228253
* all the timers and stops new packets from flowing through.
@@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p)
248273
br_ifinfo_notify(RTM_DELLINK, p);
249274

250275
list_del_rcu(&p->list);
276+
if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
277+
update_headroom(br, get_max_headroom(br));
278+
netdev_reset_rx_headroom(dev);
251279

252280
nbp_vlan_flush(p);
253281
br_fdb_delete_by_port(br, p, 0, 1);
@@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
438466
{
439467
struct net_bridge_port *p;
440468
int err = 0;
469+
unsigned br_hr, dev_hr;
441470
bool changed_addr;
442471

443472
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
@@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
505534

506535
netdev_update_features(br->dev);
507536

508-
if (br->dev->needed_headroom < dev->needed_headroom)
509-
br->dev->needed_headroom = dev->needed_headroom;
537+
br_hr = br->dev->needed_headroom;
538+
dev_hr = netdev_get_fwd_headroom(dev);
539+
if (br_hr < dev_hr)
540+
update_headroom(br, dev_hr);
541+
else
542+
netdev_set_rx_headroom(dev, br_hr);
510543

511544
if (br_fdb_insert(br, p, dev->dev_addr, 0))
512545
netdev_err(dev, "failed insert local address bridge forwarding table\n");

net/openvswitch/datapath.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net,
19081908
return ERR_PTR(-EINVAL);
19091909
}
19101910

1911+
/* Called with ovs_mutex */
1912+
static void update_headroom(struct datapath *dp)
1913+
{
1914+
unsigned dev_headroom, max_headroom = 0;
1915+
struct net_device *dev;
1916+
struct vport *vport;
1917+
int i;
1918+
1919+
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1920+
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
1921+
dev = vport->dev;
1922+
dev_headroom = netdev_get_fwd_headroom(dev);
1923+
if (dev_headroom > max_headroom)
1924+
max_headroom = dev_headroom;
1925+
}
1926+
}
1927+
1928+
dp->max_headroom = max_headroom;
1929+
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1930+
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
1931+
netdev_set_rx_headroom(vport->dev, max_headroom);
1932+
}
1933+
19111934
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
19121935
{
19131936
struct nlattr **a = info->attrs;
@@ -1973,6 +1996,12 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
19731996

19741997
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
19751998
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
1999+
2000+
if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
2001+
update_headroom(dp);
2002+
else
2003+
netdev_set_rx_headroom(vport->dev, dp->max_headroom);
2004+
19762005
BUG_ON(err < 0);
19772006
ovs_unlock();
19782007

@@ -2039,8 +2068,10 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
20392068

20402069
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
20412070
{
2071+
bool must_update_headroom = false;
20422072
struct nlattr **a = info->attrs;
20432073
struct sk_buff *reply;
2074+
struct datapath *dp;
20442075
struct vport *vport;
20452076
int err;
20462077

@@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
20622093
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
20632094
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
20642095
BUG_ON(err < 0);
2096+
2097+
/* the vport deletion may trigger dp headroom update */
2098+
dp = vport->dp;
2099+
if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
2100+
must_update_headroom = true;
2101+
netdev_reset_rx_headroom(vport->dev);
20652102
ovs_dp_detach_port(vport);
2103+
2104+
if (must_update_headroom)
2105+
update_headroom(dp);
20662106
ovs_unlock();
20672107

20682108
ovs_notify(&dp_vport_genl_family, reply, info);

net/openvswitch/datapath.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
6868
* ovs_mutex and RCU.
6969
* @stats_percpu: Per-CPU datapath statistics.
7070
* @net: Reference to net namespace.
71+
* @max_headroom: the maximum headroom of all vports in this datapath; it will
72+
* be used by all the internal vports in this dp.
7173
*
7274
* Context: See the comment on locking at the top of datapath.c for additional
7375
* locking information.
@@ -89,6 +91,8 @@ struct datapath {
8991
possible_net_t net;
9092

9193
u32 user_features;
94+
95+
u32 max_headroom;
9296
};
9397

9498
/**

net/openvswitch/vport-internal_dev.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,19 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
138138
return stats;
139139
}
140140

141+
void internal_set_rx_headroom(struct net_device *dev, int new_hr)
142+
{
143+
dev->needed_headroom = new_hr;
144+
}
145+
141146
static const struct net_device_ops internal_dev_netdev_ops = {
142147
.ndo_open = internal_dev_open,
143148
.ndo_stop = internal_dev_stop,
144149
.ndo_start_xmit = internal_dev_xmit,
145150
.ndo_set_mac_address = eth_mac_addr,
146151
.ndo_change_mtu = internal_dev_change_mtu,
147152
.ndo_get_stats64 = internal_get_stats,
153+
.ndo_set_rx_headroom = internal_set_rx_headroom,
148154
};
149155

150156
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
158164
netdev->netdev_ops = &internal_dev_netdev_ops;
159165

160166
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
161-
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
167+
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
168+
IFF_PHONY_HEADROOM;
162169
netdev->destructor = internal_dev_destructor;
163170
netdev->ethtool_ops = &internal_dev_ethtool_ops;
164171
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
199206
err = -ENOMEM;
200207
goto error_free_netdev;
201208
}
209+
vport->dev->needed_headroom = vport->dp->max_headroom;
202210

203211
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
204212
internal_dev = internal_dev_priv(vport->dev);

0 commit comments

Comments
 (0)