RH7: PCI: hv: Fix the affinity setting for the NVMe crash
When cpumask_equal(mask, cpu_online_mask) == false, "mask" may be a
superset of "cfg->domain", and the real affinity is still the one saved
in "cfg->domain" after __ioapic_set_affinity() returns. See the line
"cpumask_copy(cfg->domain, tmp_mask);" in RHEL 7.x's kernel function
__assign_irq_vector().

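To illustrate the superset relationship, here is a minimal user-space
sketch (not kernel code: the 64-bit values and variable names below are
stand-ins for struct cpumask and the driver's fields, invented purely
for illustration):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t mask = 0x3c;      /* affinity requested by the caller */
            uint64_t tmp_mask = 0x04;  /* CPUs the vector was really assigned to */
            uint64_t cfg_domain;

            /* stand-in for "cpumask_copy(cfg->domain, tmp_mask);" */
            cfg_domain = tmp_mask;

            /* "mask" contains cfg->domain but is not equal to it */
            assert((cfg_domain & mask) == cfg_domain);
            assert(cfg_domain != mask);

            printf("retarget to 0x%llx (cfg->domain), not 0x%llx (mask)\n",
                   (unsigned long long)cfg_domain,
                   (unsigned long long)mask);
            return 0;
    }

Retargeting the Hyper-V interrupt must therefore follow the smaller set
recorded in "cfg->domain".
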
So we should always use "cfg->domain"; otherwise the NVMe driver may
fail to receive the expected interrupt, and later the buggy error
handling code in nvme_dev_disable() can cause the panic below:

[   71.695565] nvme nvme7: I/O 19 QID 0 timeout, disable controller
[   71.724221] ------------[ cut here ]------------
[   71.725067] WARNING: CPU: 4 PID: 11317 at kernel/irq/manage.c:1348 __free_irq+0xb3/0x280
[   71.725067] Trying to free already-free IRQ 226
[   71.725067] Modules linked in: ...
[   71.725067] CPU: 4 PID: 11317 Comm: kworker/4:1H Tainted: G OE  ------------ T 3.10.0-957.10.1.el7.x86_64 LIS#1
[   71.725067] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090007  05/18/2018
[   71.725067] Workqueue: kblockd blk_mq_timeout_work
[   71.725067] Call Trace:
[   71.725067]  [<ffffffff8cf62e41>] dump_stack+0x19/0x1b
[   71.725067]  [<ffffffff8c897688>] __warn+0xd8/0x100
[   71.725067]  [<ffffffff8c89770f>] warn_slowpath_fmt+0x5f/0x80
[   71.725067]  [<ffffffff8c94ac83>] __free_irq+0xb3/0x280
[   71.725067]  [<ffffffff8c94aed9>] free_irq+0x39/0x90
[   71.725067]  [<ffffffffc046b33c>] nvme_dev_disable+0x11c/0x4b0 [nvme]
[   71.725067]  [<ffffffff8cca465c>] ? dev_warn+0x6c/0x90
[   71.725067]  [<ffffffffc046bb34>] nvme_timeout+0x204/0x2d0 [nvme]
[   71.725067]  [<ffffffff8cb55c6d>] ? blk_mq_do_dispatch_sched+0x9d/0x130
[   71.725067]  [<ffffffff8c8e015c>] ? update_curr+0x14c/0x1e0
[   71.725067]  [<ffffffff8cb505a2>] blk_mq_rq_timed_out+0x32/0x80
[   71.725067]  [<ffffffff8cb5064c>] blk_mq_check_expired+0x5c/0x60
[   71.725067]  [<ffffffff8cb53924>] bt_iter+0x54/0x60
[   71.725067]  [<ffffffff8cb5425b>] blk_mq_queue_tag_busy_iter+0x11b/0x290
[   71.725067]  [<ffffffff8cb505f0>] ? blk_mq_rq_timed_out+0x80/0x80
[   71.725067]  [<ffffffff8cb505f0>] ? blk_mq_rq_timed_out+0x80/0x80
[   71.725067]  [<ffffffff8cb4f1db>] blk_mq_timeout_work+0x8b/0x180
[   71.725067]  [<ffffffff8c8b9d8f>] process_one_work+0x17f/0x440
[   71.725067]  [<ffffffff8c8bae26>] worker_thread+0x126/0x3c0
[   71.725067]  [<ffffffff8c8bad00>] ? manage_workers.isra.25+0x2a0/0x2a0
[   71.725067]  [<ffffffff8c8c1c71>] kthread+0xd1/0xe0
[   71.725067]  [<ffffffff8c8c1ba0>] ? insert_kthread_work+0x40/0x40
[   71.725067]  [<ffffffff8cf75c24>] ret_from_fork_nospec_begin+0xe/0x21
[   71.725067]  [<ffffffff8c8c1ba0>] ? insert_kthread_work+0x40/0x40
[   71.725067] ---[ end trace b3257623bc50d02a ]---
[   72.196556] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048
[   72.211013] IP: [<ffffffff8c94aed9>] free_irq+0x39/0x90

It looks like the bug is more easily triggered when the VM has a lot of
vCPUs, e.g. the L64v2 or L80v2 VM sizes. Presumably, in such a VM, the
NVMe driver can pass a "mask" that has multiple bits set but is not
equal to "cpu_online_mask". Previously we incorrectly assumed the "mask"
either contains only one set bit or equals "cpu_online_mask".
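
As a toy demonstration of why that assumption breaks, the user-space
sketch below (the 8-CPU system and the single-CPU cfg->domain value are
assumptions made purely for illustration) replays the old heuristic for
the three mask shapes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const uint64_t online = 0xff;  /* all 8 CPUs online */
            const uint64_t domain = 0x01;  /* CPUs the vector is really programmed on */
            const uint64_t requests[] = {
                    0x01,  /* single set bit: old heuristic happens to match */
                    0xff,  /* equals cpu_online_mask: old code used cfg->domain */
                    0x0f,  /* multiple bits, not all CPUs: old code used the mask */
            };

            for (unsigned int i = 0; i < 3; i++) {
                    /* old heuristic: cfg->domain only when mask == cpu_online_mask */
                    uint64_t old_dest = (requests[i] == online) ? domain : requests[i];

                    printf("mask=0x%02llx -> old dest=0x%02llx, real domain=0x%02llx (%s)\n",
                           (unsigned long long)requests[i],
                           (unsigned long long)old_dest,
                           (unsigned long long)domain,
                           old_dest == domain ? "ok" : "mismatch");
            }
            return 0;
    }

Only the last case mismatches, and that is exactly the kind of request a
large VM tends to produce.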

Fixes: 9c8bbae ("RH7: PCI: hv: respect the affinity setting")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
dcui committed May 22, 2019
1 parent 1673af8 commit 259bb45
Showing 1 changed file with 3 additions and 8 deletions.
11 changes: 3 additions & 8 deletions hv-rhel7.x/hv/pci-hyperv.c
@@ -811,12 +811,11 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
 }
 
 /* Interrupt management hooks */
-static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask,
+static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
                            bool force)
 {
         struct msi_desc *msi_desc = data->msi_desc;
         struct irq_cfg *cfg = irqd_cfg(data);
-        const struct cpumask *dest;
         struct retarget_msi_interrupt *params;
         struct hv_pcibus_device *hbus;
         struct pci_bus *pbus;
@@ -827,10 +826,6 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask,
         u64 res;
         u32 var_size = 0;
 
-        if (cpumask_equal(mask, cpu_online_mask))
-                dest = cfg->domain;
-        else
-                dest = mask;
         ret = __ioapic_set_affinity(data, dest, &dest_id);
         if (ret)
                 return ret;
@@ -880,7 +875,7 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask,
                  */
                 var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;
 
-                for_each_cpu_and(cpu, dest, cpu_online_mask) {
+                for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) {
                         cpu_vmbus = hv_cpu_number_to_vp_number(cpu);
 
                         if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
@@ -894,7 +889,7 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask,
                                         (1ULL << (cpu_vmbus & 63));
                         }
                 } else {
-                        for_each_cpu_and(cpu, dest, cpu_online_mask) {
+                        for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) {
                                 params->int_target.vp_mask |=
                                         (1ULL << hv_cpu_number_to_vp_number(cpu));
                         }