Skip to content

Commit b587bda

Browse files
Moshe Shemeshdavem330
authored andcommitted
devlink: Change devlink health locking mechanism
The devlink health reporters create/destroy and user commands currently use the devlink->lock as a locking mechanism. Different reporters have different rules in the driver and are being created/destroyed during different stages of driver load/unload/running. So during execution of a reporter recover the flow can go through another reporter's destroy and create. Such flow leads to deadlock trying to lock a mutex already held. With the new locking mechanism the different reporters share mutex lock only to protect access to shared reporters list. Added refcount per reporter, to protect the reporters from destroy while being used. Signed-off-by: Moshe Shemesh <moshe@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 5be90f9 commit b587bda

File tree

2 files changed

+75
-23
lines changed

2 files changed

+75
-23
lines changed

include/net/devlink.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ struct devlink {
3232
struct list_head region_list;
3333
u32 snapshot_id;
3434
struct list_head reporter_list;
35+
struct mutex reporters_lock; /* protects reporter_list */
3536
struct devlink_dpipe_headers *dpipe_headers;
3637
const struct devlink_ops *ops;
3738
struct device *dev;

net/core/devlink.c

Lines changed: 74 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <linux/list.h>
2121
#include <linux/netdevice.h>
2222
#include <linux/spinlock.h>
23+
#include <linux/refcount.h>
2324
#include <rdma/ib_verbs.h>
2425
#include <net/netlink.h>
2526
#include <net/genetlink.h>
@@ -4432,6 +4433,7 @@ struct devlink_health_reporter {
44324433
u64 error_count;
44334434
u64 recovery_count;
44344435
u64 last_recovery_ts;
4436+
refcount_t refcount;
44354437
};
44364438

44374439
void *
@@ -4447,6 +4449,7 @@ devlink_health_reporter_find_by_name(struct devlink *devlink,
44474449
{
44484450
struct devlink_health_reporter *reporter;
44494451

4452+
lockdep_assert_held(&devlink->reporters_lock);
44504453
list_for_each_entry(reporter, &devlink->reporter_list, list)
44514454
if (!strcmp(reporter->ops->name, reporter_name))
44524455
return reporter;
@@ -4470,7 +4473,7 @@ devlink_health_reporter_create(struct devlink *devlink,
44704473
{
44714474
struct devlink_health_reporter *reporter;
44724475

4473-
mutex_lock(&devlink->lock);
4476+
mutex_lock(&devlink->reporters_lock);
44744477
if (devlink_health_reporter_find_by_name(devlink, ops->name)) {
44754478
reporter = ERR_PTR(-EEXIST);
44764479
goto unlock;
@@ -4494,9 +4497,10 @@ devlink_health_reporter_create(struct devlink *devlink,
44944497
reporter->graceful_period = graceful_period;
44954498
reporter->auto_recover = auto_recover;
44964499
mutex_init(&reporter->dump_lock);
4500+
refcount_set(&reporter->refcount, 1);
44974501
list_add_tail(&reporter->list, &devlink->reporter_list);
44984502
unlock:
4499-
mutex_unlock(&devlink->lock);
4503+
mutex_unlock(&devlink->reporters_lock);
45004504
return reporter;
45014505
}
45024506
EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
@@ -4509,10 +4513,12 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
45094513
void
45104514
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
45114515
{
4512-
mutex_lock(&reporter->devlink->lock);
4516+
mutex_lock(&reporter->devlink->reporters_lock);
45134517
list_del(&reporter->list);
4518+
mutex_unlock(&reporter->devlink->reporters_lock);
4519+
while (refcount_read(&reporter->refcount) > 1)
4520+
msleep(100);
45144521
mutex_destroy(&reporter->dump_lock);
4515-
mutex_unlock(&reporter->devlink->lock);
45164522
if (reporter->dump_fmsg)
45174523
devlink_fmsg_free(reporter->dump_fmsg);
45184524
kfree(reporter);
@@ -4648,14 +4654,26 @@ static struct devlink_health_reporter *
46484654
devlink_health_reporter_get_from_info(struct devlink *devlink,
46494655
struct genl_info *info)
46504656
{
4657+
struct devlink_health_reporter *reporter;
46514658
char *reporter_name;
46524659

46534660
if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
46544661
return NULL;
46554662

46564663
reporter_name =
46574664
nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
4658-
return devlink_health_reporter_find_by_name(devlink, reporter_name);
4665+
mutex_lock(&devlink->reporters_lock);
4666+
reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
4667+
if (reporter)
4668+
refcount_inc(&reporter->refcount);
4669+
mutex_unlock(&devlink->reporters_lock);
4670+
return reporter;
4671+
}
4672+
4673+
static void
4674+
devlink_health_reporter_put(struct devlink_health_reporter *reporter)
4675+
{
4676+
refcount_dec(&reporter->refcount);
46594677
}
46604678

46614679
static int
@@ -4730,19 +4748,24 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
47304748
return -EINVAL;
47314749

47324750
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4733-
if (!msg)
4734-
return -ENOMEM;
4751+
if (!msg) {
4752+
err = -ENOMEM;
4753+
goto out;
4754+
}
47354755

47364756
err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
47374757
DEVLINK_CMD_HEALTH_REPORTER_GET,
47384758
info->snd_portid, info->snd_seq,
47394759
0);
47404760
if (err) {
47414761
nlmsg_free(msg);
4742-
return err;
4762+
goto out;
47434763
}
47444764

4745-
return genlmsg_reply(msg, info);
4765+
err = genlmsg_reply(msg, info);
4766+
out:
4767+
devlink_health_reporter_put(reporter);
4768+
return err;
47464769
}
47474770

47484771
static int
@@ -4759,7 +4782,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
47594782
list_for_each_entry(devlink, &devlink_list, list) {
47604783
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
47614784
continue;
4762-
mutex_lock(&devlink->lock);
4785+
mutex_lock(&devlink->reporters_lock);
47634786
list_for_each_entry(reporter, &devlink->reporter_list,
47644787
list) {
47654788
if (idx < start) {
@@ -4773,12 +4796,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
47734796
cb->nlh->nlmsg_seq,
47744797
NLM_F_MULTI);
47754798
if (err) {
4776-
mutex_unlock(&devlink->lock);
4799+
mutex_unlock(&devlink->reporters_lock);
47774800
goto out;
47784801
}
47794802
idx++;
47804803
}
4781-
mutex_unlock(&devlink->lock);
4804+
mutex_unlock(&devlink->reporters_lock);
47824805
}
47834806
out:
47844807
mutex_unlock(&devlink_mutex);
@@ -4793,15 +4816,18 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
47934816
{
47944817
struct devlink *devlink = info->user_ptr[0];
47954818
struct devlink_health_reporter *reporter;
4819+
int err;
47964820

47974821
reporter = devlink_health_reporter_get_from_info(devlink, info);
47984822
if (!reporter)
47994823
return -EINVAL;
48004824

48014825
if (!reporter->ops->recover &&
48024826
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
4803-
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
4804-
return -EOPNOTSUPP;
4827+
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) {
4828+
err = -EOPNOTSUPP;
4829+
goto out;
4830+
}
48054831

48064832
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
48074833
reporter->graceful_period =
@@ -4811,20 +4837,28 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
48114837
reporter->auto_recover =
48124838
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
48134839

4840+
devlink_health_reporter_put(reporter);
48144841
return 0;
4842+
out:
4843+
devlink_health_reporter_put(reporter);
4844+
return err;
48154845
}
48164846

48174847
static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
48184848
struct genl_info *info)
48194849
{
48204850
struct devlink *devlink = info->user_ptr[0];
48214851
struct devlink_health_reporter *reporter;
4852+
int err;
48224853

48234854
reporter = devlink_health_reporter_get_from_info(devlink, info);
48244855
if (!reporter)
48254856
return -EINVAL;
48264857

4827-
return devlink_health_reporter_recover(reporter, NULL);
4858+
err = devlink_health_reporter_recover(reporter, NULL);
4859+
4860+
devlink_health_reporter_put(reporter);
4861+
return err;
48284862
}
48294863

48304864
static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
@@ -4839,12 +4873,16 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
48394873
if (!reporter)
48404874
return -EINVAL;
48414875

4842-
if (!reporter->ops->diagnose)
4876+
if (!reporter->ops->diagnose) {
4877+
devlink_health_reporter_put(reporter);
48434878
return -EOPNOTSUPP;
4879+
}
48444880

48454881
fmsg = devlink_fmsg_alloc();
4846-
if (!fmsg)
4882+
if (!fmsg) {
4883+
devlink_health_reporter_put(reporter);
48474884
return -ENOMEM;
4885+
}
48484886

48494887
err = devlink_fmsg_obj_nest_start(fmsg);
48504888
if (err)
@@ -4863,6 +4901,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
48634901

48644902
out:
48654903
devlink_fmsg_free(fmsg);
4904+
devlink_health_reporter_put(reporter);
48664905
return err;
48674906
}
48684907

@@ -4877,8 +4916,10 @@ static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
48774916
if (!reporter)
48784917
return -EINVAL;
48794918

4880-
if (!reporter->ops->dump)
4919+
if (!reporter->ops->dump) {
4920+
devlink_health_reporter_put(reporter);
48814921
return -EOPNOTSUPP;
4922+
}
48824923

48834924
mutex_lock(&reporter->dump_lock);
48844925
err = devlink_health_do_dump(reporter, NULL);
@@ -4890,6 +4931,7 @@ static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
48904931

48914932
out:
48924933
mutex_unlock(&reporter->dump_lock);
4934+
devlink_health_reporter_put(reporter);
48934935
return err;
48944936
}
48954937

@@ -4904,12 +4946,15 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
49044946
if (!reporter)
49054947
return -EINVAL;
49064948

4907-
if (!reporter->ops->dump)
4949+
if (!reporter->ops->dump) {
4950+
devlink_health_reporter_put(reporter);
49084951
return -EOPNOTSUPP;
4952+
}
49094953

49104954
mutex_lock(&reporter->dump_lock);
49114955
devlink_health_dump_clear(reporter);
49124956
mutex_unlock(&reporter->dump_lock);
4957+
devlink_health_reporter_put(reporter);
49134958
return 0;
49144959
}
49154960

@@ -5191,29 +5236,33 @@ static const struct genl_ops devlink_nl_ops[] = {
51915236
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
51925237
.doit = devlink_nl_cmd_health_reporter_get_doit,
51935238
.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
5194-
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
5239+
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
5240+
DEVLINK_NL_FLAG_NO_LOCK,
51955241
/* can be retrieved by unprivileged users */
51965242
},
51975243
{
51985244
.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
51995245
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
52005246
.doit = devlink_nl_cmd_health_reporter_set_doit,
52015247
.flags = GENL_ADMIN_PERM,
5202-
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
5248+
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
5249+
DEVLINK_NL_FLAG_NO_LOCK,
52035250
},
52045251
{
52055252
.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
52065253
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
52075254
.doit = devlink_nl_cmd_health_reporter_recover_doit,
52085255
.flags = GENL_ADMIN_PERM,
5209-
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
5256+
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
5257+
DEVLINK_NL_FLAG_NO_LOCK,
52105258
},
52115259
{
52125260
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
52135261
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
52145262
.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
52155263
.flags = GENL_ADMIN_PERM,
5216-
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
5264+
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
5265+
DEVLINK_NL_FLAG_NO_LOCK,
52175266
},
52185267
{
52195268
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
@@ -5284,6 +5333,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
52845333
INIT_LIST_HEAD(&devlink->region_list);
52855334
INIT_LIST_HEAD(&devlink->reporter_list);
52865335
mutex_init(&devlink->lock);
5336+
mutex_init(&devlink->reporters_lock);
52875337
return devlink;
52885338
}
52895339
EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -5326,6 +5376,7 @@ EXPORT_SYMBOL_GPL(devlink_unregister);
53265376
*/
53275377
void devlink_free(struct devlink *devlink)
53285378
{
5379+
mutex_destroy(&devlink->reporters_lock);
53295380
mutex_destroy(&devlink->lock);
53305381
WARN_ON(!list_empty(&devlink->reporter_list));
53315382
WARN_ON(!list_empty(&devlink->region_list));

0 commit comments

Comments
 (0)