Skip to content

Commit 352899f

Browse files
mark-bloch authored and Saeed Mahameed committed
net/mlx5: Lag, use buckets in hash mode
When in hardware lag and the NIC has more than 2 ports, if one port goes down the traffic needs to be distributed between the remaining active ports. For a better spread in such cases, instead of using a 1-to-1 mapping with only 4 slots in the hash, use many: each port will have many slots that point to it. When a port goes down, go over all the slots that pointed to that port and spread them between the remaining active ports. Once the port comes back, restore the default mapping. We will have number_of_ports * MLX5_LAG_MAX_HASH_BUCKETS slots; each group of MLX5_LAG_MAX_HASH_BUCKETS slots belongs to a different port. The native mapping is such that: port 1: the first MLX5_LAG_MAX_HASH_BUCKETS slots are [1, 1, .., 1], which means that if a packet is hashed into one of these slots it will hit the wire via port 1. port 2: the second MLX5_LAG_MAX_HASH_BUCKETS slots are [2, 2, .., 2], which means that if a packet is hashed into one of these slots it will hit the wire via port 2. The mapping is the same for the rest of the ports. On a failover, say port 2 goes down (ports 1, 3, 4 are still up); the new mapping for port 2 will be: port 2: the second MLX5_LAG_MAX_HASH_BUCKETS slots are [1, 3, 1, 4, .., 4], which means the mapping was changed from the native mapping to one that consists only of the active ports. With this, if a port goes down the traffic will be split between the active ports randomly. Signed-off-by: Mark Bloch <mbloch@nvidia.com> Reviewed-by: Maor Gottlieb <maorg@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
1 parent 24b3599 commit 352899f

File tree

4 files changed

+182
-76
lines changed

4 files changed

+182
-76
lines changed

drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c

Lines changed: 113 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -107,14 +107,73 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
107107
}
108108
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
109109

110+
/* Collect the indices of all lag ports that currently cannot carry TX
 * traffic (TX disabled by the bond driver, or the link is down).
 *
 * @tracker:      bonding state as reported by netdev events
 * @num_ports:    number of ports in the lag
 * @ports:        out array, filled with the indices of inactive ports
 * @num_disabled: out, number of entries written to @ports
 */
static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
				   u8 *ports, int *num_disabled)
{
	int cnt = 0;
	int i;

	for (i = 0; i < num_ports; i++)
		if (!tracker->netdev_state[i].tx_enabled ||
		    !tracker->netdev_state[i].link_up)
			ports[cnt++] = i;

	*num_disabled = cnt;
}
122+
123+
/* Collect the indices of all lag ports that can currently carry TX
 * traffic (TX enabled and link up).
 *
 * If no port qualifies, fall back to reporting the disabled set so the
 * caller always gets at least one entry to work with.
 *
 * @tracker:     bonding state as reported by netdev events
 * @num_ports:   number of ports in the lag
 * @ports:       out array, filled with the indices of active ports
 * @num_enabled: out, number of entries written to @ports
 */
static void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
				  u8 *ports, int *num_enabled)
{
	int cnt = 0;
	int i;

	for (i = 0; i < num_ports; i++)
		if (tracker->netdev_state[i].tx_enabled &&
		    tracker->netdev_state[i].link_up)
			ports[cnt++] = i;

	*num_enabled = cnt;
	if (!cnt)
		mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
}
138+
110139
static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
111-
struct mlx5_lag *ldev)
140+
struct mlx5_lag *ldev,
141+
struct lag_tracker *tracker,
142+
u8 flags)
112143
{
144+
char buf[MLX5_MAX_PORTS * 10 + 1] = {};
145+
u8 enabled_ports[MLX5_MAX_PORTS] = {};
146+
int written = 0;
147+
int num_enabled;
148+
int idx;
149+
int err;
113150
int i;
151+
int j;
114152

115-
mlx5_core_info(dev, "lag map:\n");
116-
for (i = 0; i < ldev->ports; i++)
117-
mlx5_core_info(dev, "\tport %d:%d\n", i + 1, ldev->v2p_map[i]);
153+
if (flags & MLX5_LAG_FLAG_HASH_BASED) {
154+
mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
155+
&num_enabled);
156+
for (i = 0; i < num_enabled; i++) {
157+
err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
158+
if (err != 3)
159+
return;
160+
written += err;
161+
}
162+
buf[written - 2] = 0;
163+
mlx5_core_info(dev, "lag map active ports: %s\n", buf);
164+
} else {
165+
for (i = 0; i < ldev->ports; i++) {
166+
for (j = 0; j < ldev->buckets; j++) {
167+
idx = i * ldev->buckets + j;
168+
err = scnprintf(buf + written, 10,
169+
" port %d:%d", i + 1, ldev->v2p_map[idx]);
170+
if (err != 9)
171+
return;
172+
written += err;
173+
}
174+
}
175+
mlx5_core_info(dev, "lag map:%s\n", buf);
176+
}
118177
}
119178

120179
static int mlx5_lag_netdev_event(struct notifier_block *this,
@@ -174,6 +233,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
174233
mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
175234
err);
176235
ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
236+
ldev->buckets = 1;
177237

178238
return ldev;
179239
}
@@ -200,28 +260,25 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
200260
return !!(ldev->flags & MLX5_LAG_FLAG_SRIOV);
201261
}
202262

203-
static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
204-
u8 *ports, int *num_disabled)
205-
{
206-
int i;
207-
208-
*num_disabled = 0;
209-
for (i = 0; i < num_ports; i++) {
210-
if (!tracker->netdev_state[i].tx_enabled ||
211-
!tracker->netdev_state[i].link_up)
212-
ports[(*num_disabled)++] = i;
213-
}
214-
}
215-
263+
/* Create a mapping between steering slots and active ports.
264+
* As we have ldev->buckets slots per port first assume the native
265+
* mapping should be used.
266+
* If there are ports that are disabled fill the relevant slots
267+
* with mapping that points to active ports.
268+
*/
216269
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
217-
u8 num_ports, u8 *ports)
270+
u8 num_ports,
271+
u8 buckets,
272+
u8 *ports)
218273
{
219274
int disabled[MLX5_MAX_PORTS] = {};
220275
int enabled[MLX5_MAX_PORTS] = {};
221276
int disabled_ports_num = 0;
222277
int enabled_ports_num = 0;
278+
int idx;
223279
u32 rand;
224280
int i;
281+
int j;
225282

226283
for (i = 0; i < num_ports; i++) {
227284
if (tracker->netdev_state[i].tx_enabled &&
@@ -231,9 +288,14 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
231288
disabled[disabled_ports_num++] = i;
232289
}
233290

234-
/* Use native mapping by default */
291+
/* Use native mapping by default where each port's buckets
292+
* point the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc
293+
*/
235294
for (i = 0; i < num_ports; i++)
236-
ports[i] = MLX5_LAG_EGRESS_PORT_1 + i;
295+
for (j = 0; j < buckets; j++) {
296+
idx = i * buckets + j;
297+
ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
298+
}
237299

238300
/* If all ports are disabled/enabled keep native mapping */
239301
if (enabled_ports_num == num_ports ||
@@ -242,9 +304,10 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
242304

243305
/* Go over the disabled ports and for each assign a random active port */
244306
for (i = 0; i < disabled_ports_num; i++) {
245-
get_random_bytes(&rand, 4);
246-
247-
ports[disabled[i]] = enabled[rand % enabled_ports_num] + 1;
307+
for (j = 0; j < buckets; j++) {
308+
get_random_bytes(&rand, 4);
309+
ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
310+
}
248311
}
249312
}
250313

@@ -317,28 +380,33 @@ static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
317380
void mlx5_modify_lag(struct mlx5_lag *ldev,
318381
struct lag_tracker *tracker)
319382
{
383+
u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
320384
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
321-
u8 ports[MLX5_MAX_PORTS] = {};
385+
int idx;
322386
int err;
323387
int i;
388+
int j;
324389

325-
mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ports);
390+
mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);
326391

327392
for (i = 0; i < ldev->ports; i++) {
328-
if (ports[i] == ldev->v2p_map[i])
329-
continue;
330-
err = _mlx5_modify_lag(ldev, ports);
331-
if (err) {
332-
mlx5_core_err(dev0,
333-
"Failed to modify LAG (%d)\n",
334-
err);
335-
return;
336-
}
337-
memcpy(ldev->v2p_map, ports, sizeof(ports[0]) *
338-
ldev->ports);
393+
for (j = 0; j < ldev->buckets; j++) {
394+
idx = i * ldev->buckets + j;
395+
if (ports[idx] == ldev->v2p_map[idx])
396+
continue;
397+
err = _mlx5_modify_lag(ldev, ports);
398+
if (err) {
399+
mlx5_core_err(dev0,
400+
"Failed to modify LAG (%d)\n",
401+
err);
402+
return;
403+
}
404+
memcpy(ldev->v2p_map, ports, sizeof(ports));
339405

340-
mlx5_lag_print_mapping(dev0, ldev);
341-
break;
406+
mlx5_lag_print_mapping(dev0, ldev, tracker,
407+
ldev->flags);
408+
break;
409+
}
342410
}
343411

344412
if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
@@ -357,6 +425,8 @@ static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
357425
if (!MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table))
358426
return -EINVAL;
359427
*flags |= MLX5_LAG_FLAG_HASH_BASED;
428+
if (ldev->ports > 2)
429+
ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
360430
}
361431

362432
return 0;
@@ -370,6 +440,7 @@ static int mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
370440
if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
371441
tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
372442
*flags |= MLX5_LAG_FLAG_HASH_BASED;
443+
373444
return 0;
374445
}
375446

@@ -399,7 +470,7 @@ static int mlx5_create_lag(struct mlx5_lag *ldev,
399470
u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
400471
int err;
401472

402-
mlx5_lag_print_mapping(dev0, ldev);
473+
mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
403474
mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
404475
shared_fdb, get_str_port_sel_mode(flags));
405476

@@ -439,11 +510,12 @@ int mlx5_activate_lag(struct mlx5_lag *ldev,
439510
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
440511
int err;
441512

442-
mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->v2p_map);
443513
err = mlx5_lag_set_port_sel_mode(ldev, tracker, &flags);
444514
if (err)
445515
return err;
446516

517+
mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
518+
447519
if (flags & MLX5_LAG_FLAG_HASH_BASED) {
448520
err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
449521
ldev->v2p_map);
@@ -1265,7 +1337,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
12651337
}
12661338
}
12671339

1268-
port = ldev->v2p_map[port];
1340+
port = ldev->v2p_map[port * ldev->buckets];
12691341

12701342
unlock:
12711343
spin_unlock(&lag_lock);

drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#ifndef __MLX5_LAG_H__
55
#define __MLX5_LAG_H__
66

7+
#define MLX5_LAG_MAX_HASH_BUCKETS 16
78
#include "mlx5_core.h"
89
#include "mp.h"
910
#include "port_sel.h"
@@ -46,9 +47,10 @@ struct lag_tracker {
4647
struct mlx5_lag {
4748
u8 flags;
4849
u8 ports;
50+
u8 buckets;
4951
int mode_changes_in_progress;
5052
bool shared_fdb;
51-
u8 v2p_map[MLX5_MAX_PORTS];
53+
u8 v2p_map[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS];
5254
struct kref ref;
5355
struct lag_func pf[MLX5_MAX_PORTS];
5456
struct lag_tracker tracker;

0 commit comments

Comments
 (0)