Skip to content

Commit 8b27ee6

Browse files
author
Alex Williamson
committed
vfio-pci: PCI hot reset interface
The current VFIO_DEVICE_RESET interface only maps to PCI use cases where we can isolate the reset to the individual PCI function. This means the device must support FLR (PCIe or AF), PM reset on D3hot->D0 transition, device specific reset, or be a singleton device on a bus for a secondary bus reset. FLR does not have widespread support, PM reset is not very reliable, and bus topology is dictated by the system and device design. We need to provide a means for a user to induce a bus reset in cases where the existing mechanisms are not available or not reliable. This device specific extension to VFIO provides the user with this ability. Two new ioctls are introduced: VFIO_DEVICE_GET_PCI_HOT_RESET_INFO and VFIO_DEVICE_PCI_HOT_RESET. The first provides the user with information about the extent of devices affected by a hot reset. This is essentially a list of devices and the IOMMU groups they belong to. The user may then initiate a hot reset by calling the second ioctl. We must be careful that the user has ownership of all the affected devices found via the first ioctl, so the second ioctl takes a list of file descriptors for the VFIO groups affected by the reset. Each group must have IOMMU protection established for the ioctl to succeed. Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
1 parent 3bc4f39 commit 8b27ee6

File tree

2 files changed

+323
-1
lines changed

2 files changed

+323
-1
lines changed

drivers/vfio/pci/vfio_pci.c

Lines changed: 285 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include <linux/device.h>
1515
#include <linux/eventfd.h>
16+
#include <linux/file.h>
1617
#include <linux/interrupt.h>
1718
#include <linux/iommu.h>
1819
#include <linux/module.h>
@@ -227,6 +228,110 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
227228
return 0;
228229
}
229230

231+
/*
 * Per-device callback that counts visited devices.
 *
 * @pdev: device being visited (not examined)
 * @data: points to the int counter to bump
 *
 * Always returns 0.  The ioctl code passes this to
 * vfio_pci_for_each_slot_or_bus() to size its buffers.
 */
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	int *counter = data;

	*counter += 1;

	return 0;
}
236+
237+
/*
 * Cursor state handed to vfio_pci_fill_devs() while it writes one record
 * per visited device into a caller-provided array.
 */
struct vfio_pci_fill_info {
238+
int max; /* number of entries available in devices[] */
239+
int cur; /* next index to write; also the number of entries filled so far */
240+
struct vfio_pci_dependent_device *devices; /* output array, devices[0..max-1] */
241+
};
242+
243+
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
244+
{
245+
struct vfio_pci_fill_info *fill = data;
246+
struct iommu_group *iommu_group;
247+
248+
if (fill->cur == fill->max)
249+
return -EAGAIN; /* Something changed, try again */
250+
251+
iommu_group = iommu_group_get(&pdev->dev);
252+
if (!iommu_group)
253+
return -EPERM; /* Cannot reset non-isolated devices */
254+
255+
fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
256+
fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
257+
fill->devices[fill->cur].bus = pdev->bus->number;
258+
fill->devices[fill->cur].devfn = pdev->devfn;
259+
fill->cur++;
260+
iommu_group_put(iommu_group);
261+
return 0;
262+
}
263+
264+
/*
 * One user-supplied VFIO group, as collected by the VFIO_DEVICE_PCI_HOT_RESET
 * handler in vfio_pci_ioctl().
 */
struct vfio_pci_group_entry {
265+
struct vfio_group *group; /* from vfio_group_get_external_user(); released with vfio_group_put_external_user() */
266+
int id; /* from vfio_external_user_iommu_id(); compared against iommu_group_id() by vfio_pci_validate_devs() */
267+
};
268+
269+
/*
 * The set of groups the user claims to own; passed as the callback argument
 * to vfio_pci_validate_devs().
 */
struct vfio_pci_group_info {
270+
int count; /* number of entries in groups[] */
271+
struct vfio_pci_group_entry *groups; /* array searched linearly by group ID */
272+
};
273+
274+
static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
275+
{
276+
struct vfio_pci_group_info *info = data;
277+
struct iommu_group *group;
278+
int id, i;
279+
280+
group = iommu_group_get(&pdev->dev);
281+
if (!group)
282+
return -EPERM;
283+
284+
id = iommu_group_id(group);
285+
286+
for (i = 0; i < info->count; i++)
287+
if (info->groups[i].id == id)
288+
break;
289+
290+
iommu_group_put(group);
291+
292+
return (i == info->count) ? -EINVAL : 0;
293+
}
294+
295+
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
296+
{
297+
for (; pdev; pdev = pdev->bus->self)
298+
if (pdev->bus == slot->bus)
299+
return (pdev->slot == slot);
300+
return false;
301+
}
302+
303+
/*
 * Context that vfio_pci_for_each_slot_or_bus() hands to
 * vfio_pci_walk_wrapper() for each device visited by pci_walk_bus().
 */
struct vfio_pci_walk_info {
304+
int (*fn)(struct pci_dev *, void *data); /* caller's per-device callback */
305+
void *data; /* opaque argument forwarded to fn */
306+
struct pci_dev *pdev; /* device whose ->slot is used for filtering when slot is true */
307+
bool slot; /* true: only call fn for devices below pdev->slot; false: call it for every device */
308+
int ret; /* last value returned by fn; starts at 0 */
309+
};
310+
311+
static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
312+
{
313+
struct vfio_pci_walk_info *walk = data;
314+
315+
if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
316+
walk->ret = walk->fn(pdev, walk->data);
317+
318+
return walk->ret;
319+
}
320+
321+
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
322+
int (*fn)(struct pci_dev *,
323+
void *data), void *data,
324+
bool slot)
325+
{
326+
struct vfio_pci_walk_info walk = {
327+
.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
328+
};
329+
330+
pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
331+
332+
return walk.ret;
333+
}
334+
230335
static long vfio_pci_ioctl(void *device_data,
231336
unsigned int cmd, unsigned long arg)
232337
{
@@ -407,10 +512,189 @@ static long vfio_pci_ioctl(void *device_data,
407512

408513
return ret;
409514

410-
} else if (cmd == VFIO_DEVICE_RESET)
515+
} else if (cmd == VFIO_DEVICE_RESET) {
411516
return vdev->reset_works ?
412517
pci_reset_function(vdev->pdev) : -EINVAL;
413518

519+
} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
520+
struct vfio_pci_hot_reset_info hdr;
521+
struct vfio_pci_fill_info fill = { 0 };
522+
struct vfio_pci_dependent_device *devices = NULL;
523+
bool slot = false;
524+
int ret = 0;
525+
526+
minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
527+
528+
if (copy_from_user(&hdr, (void __user *)arg, minsz))
529+
return -EFAULT;
530+
531+
if (hdr.argsz < minsz)
532+
return -EINVAL;
533+
534+
hdr.flags = 0;
535+
536+
/* Can we do a slot or bus reset or neither? */
537+
if (!pci_probe_reset_slot(vdev->pdev->slot))
538+
slot = true;
539+
else if (pci_probe_reset_bus(vdev->pdev->bus))
540+
return -ENODEV;
541+
542+
/* How many devices are affected? */
543+
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
544+
vfio_pci_count_devs,
545+
&fill.max, slot);
546+
if (ret)
547+
return ret;
548+
549+
WARN_ON(!fill.max); /* Should always be at least one */
550+
551+
/*
552+
* If there's enough space, fill it now, otherwise return
553+
* -ENOSPC and the number of devices affected.
554+
*/
555+
if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
556+
ret = -ENOSPC;
557+
hdr.count = fill.max;
558+
goto reset_info_exit;
559+
}
560+
561+
devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
562+
if (!devices)
563+
return -ENOMEM;
564+
565+
fill.devices = devices;
566+
567+
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
568+
vfio_pci_fill_devs,
569+
&fill, slot);
570+
571+
/*
572+
* If a device was removed between counting and filling,
573+
* we may come up short of fill.max. If a device was
574+
* added, we'll have a return of -EAGAIN above.
575+
*/
576+
if (!ret)
577+
hdr.count = fill.cur;
578+
579+
reset_info_exit:
580+
if (copy_to_user((void __user *)arg, &hdr, minsz))
581+
ret = -EFAULT;
582+
583+
if (!ret) {
584+
if (copy_to_user((void __user *)(arg + minsz), devices,
585+
hdr.count * sizeof(*devices)))
586+
ret = -EFAULT;
587+
}
588+
589+
kfree(devices);
590+
return ret;
591+
592+
} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
593+
struct vfio_pci_hot_reset hdr;
594+
int32_t *group_fds;
595+
struct vfio_pci_group_entry *groups;
596+
struct vfio_pci_group_info info;
597+
bool slot = false;
598+
int i, count = 0, ret = 0;
599+
600+
minsz = offsetofend(struct vfio_pci_hot_reset, count);
601+
602+
if (copy_from_user(&hdr, (void __user *)arg, minsz))
603+
return -EFAULT;
604+
605+
if (hdr.argsz < minsz || hdr.flags)
606+
return -EINVAL;
607+
608+
/* Can we do a slot or bus reset or neither? */
609+
if (!pci_probe_reset_slot(vdev->pdev->slot))
610+
slot = true;
611+
else if (pci_probe_reset_bus(vdev->pdev->bus))
612+
return -ENODEV;
613+
614+
/*
615+
* We can't let userspace give us an arbitrarily large
616+
* buffer to copy, so verify how many we think there
617+
* could be. Note groups can have multiple devices so
618+
* one group per device is the max.
619+
*/
620+
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
621+
vfio_pci_count_devs,
622+
&count, slot);
623+
if (ret)
624+
return ret;
625+
626+
/* Somewhere between 1 and count is OK */
627+
if (!hdr.count || hdr.count > count)
628+
return -EINVAL;
629+
630+
group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
631+
groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
632+
if (!group_fds || !groups) {
633+
kfree(group_fds);
634+
kfree(groups);
635+
return -ENOMEM;
636+
}
637+
638+
if (copy_from_user(group_fds, (void __user *)(arg + minsz),
639+
hdr.count * sizeof(*group_fds))) {
640+
kfree(group_fds);
641+
kfree(groups);
642+
return -EFAULT;
643+
}
644+
645+
/*
646+
* For each group_fd, get the group through the vfio external
647+
* user interface and store the group and iommu ID. This
648+
* ensures the group is held across the reset.
649+
*/
650+
for (i = 0; i < hdr.count; i++) {
651+
struct vfio_group *group;
652+
struct fd f = fdget(group_fds[i]);
653+
if (!f.file) {
654+
ret = -EBADF;
655+
break;
656+
}
657+
658+
group = vfio_group_get_external_user(f.file);
659+
fdput(f);
660+
if (IS_ERR(group)) {
661+
ret = PTR_ERR(group);
662+
break;
663+
}
664+
665+
groups[i].group = group;
666+
groups[i].id = vfio_external_user_iommu_id(group);
667+
}
668+
669+
kfree(group_fds);
670+
671+
/* release reference to groups on error */
672+
if (ret)
673+
goto hot_reset_release;
674+
675+
info.count = hdr.count;
676+
info.groups = groups;
677+
678+
/*
679+
* Test whether all the affected devices are contained
680+
* by the set of groups provided by the user.
681+
*/
682+
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
683+
vfio_pci_validate_devs,
684+
&info, slot);
685+
if (!ret)
686+
/* User has access, do the reset */
687+
ret = slot ? pci_reset_slot(vdev->pdev->slot) :
688+
pci_reset_bus(vdev->pdev->bus);
689+
690+
hot_reset_release:
691+
for (i--; i >= 0; i--)
692+
vfio_group_put_external_user(groups[i].group);
693+
694+
kfree(groups);
695+
return ret;
696+
}
697+
414698
return -ENOTTY;
415699
}
416700

include/uapi/linux/vfio.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,44 @@ enum {
324324
VFIO_PCI_NUM_IRQS
325325
};
326326

327+
/**
328+
* VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
329+
* struct vfio_pci_hot_reset_info)
330+
*
331+
* Return: 0 on success, -errno on failure:
332+
* -ENOSPC = insufficient buffer, -ENODEV = unsupported for device.
333+
*/
334+
/*
 * One device that would be affected by a hot reset, as reported by
 * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO.
 */
struct vfio_pci_dependent_device {
335+
__u32 group_id; /* ID of the IOMMU group the device belongs to */
336+
__u16 segment; /* PCI domain (segment) number of the device's bus */
337+
__u8 bus; /* bus number */
338+
__u8 devfn; /* Use PCI_SLOT/PCI_FUNC */
339+
};
340+
341+
/*
 * Argument buffer for VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: a fixed header
 * followed by a variable-length array of affected devices.
 */
struct vfio_pci_hot_reset_info {
342+
__u32 argsz; /* in: size of the user buffer in bytes, header plus devices[] */
343+
__u32 flags; /* out: set to 0 by vfio-pci */
344+
__u32 count; /* out: number of affected devices; also filled in when the call fails with -ENOSPC */
345+
struct vfio_pci_dependent_device devices[]; /* out: count entries, written only on success */
346+
};
347+
348+
/* ioctl request number; handled by vfio_pci_ioctl() in drivers/vfio/pci/vfio_pci.c */
#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
349+
350+
/**
351+
* VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
352+
* struct vfio_pci_hot_reset)
353+
*
354+
* Return: 0 on success, -errno on failure.
355+
*/
356+
/*
 * Argument buffer for VFIO_DEVICE_PCI_HOT_RESET: a fixed header followed by
 * the file descriptors of the VFIO groups the caller owns.
 */
struct vfio_pci_hot_reset {
357+
__u32 argsz; /* in: size of the user buffer in bytes; must cover at least the header */
358+
__u32 flags; /* in: must be 0, otherwise vfio-pci returns -EINVAL */
359+
__u32 count; /* in: number of entries in group_fds[]; at least 1 and at most the number of affected devices */
360+
__s32 group_fds[]; /* in: VFIO group file descriptors covering every affected device */
361+
};
362+
363+
/* ioctl request number; handled by vfio_pci_ioctl() in drivers/vfio/pci/vfio_pci.c */
#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
364+
327365
/* -------- API for Type1 VFIO IOMMU -------- */
328366

329367
/**

0 commit comments

Comments
 (0)