@@ -489,7 +489,10 @@ struct hv_pcibus_device {
489489 struct fwnode_handle * fwnode ;
490490 /* Protocol version negotiated with the host */
491491 enum pci_protocol_version_t protocol_version ;
492+
493+ struct mutex state_lock ;
492494 enum hv_pcibus_state state ;
495+
493496 struct hv_device * hdev ;
494497 resource_size_t low_mmio_space ;
495498 resource_size_t high_mmio_space ;
@@ -553,19 +556,10 @@ struct hv_dr_state {
553556 struct hv_pcidev_description func [];
554557};
555558
556- enum hv_pcichild_state {
557- hv_pcichild_init = 0 ,
558- hv_pcichild_requirements ,
559- hv_pcichild_resourced ,
560- hv_pcichild_ejecting ,
561- hv_pcichild_maximum
562- };
563-
564559struct hv_pci_dev {
565560 /* List protected by pci_rescan_remove_lock */
566561 struct list_head list_entry ;
567562 refcount_t refs ;
568- enum hv_pcichild_state state ;
569563 struct pci_slot * pci_slot ;
570564 struct hv_pcidev_description desc ;
571565 bool reported_missing ;
@@ -643,6 +637,11 @@ static void hv_arch_irq_unmask(struct irq_data *data)
643637 pbus = pdev -> bus ;
644638 hbus = container_of (pbus -> sysdata , struct hv_pcibus_device , sysdata );
645639 int_desc = data -> chip_data ;
640+ if (!int_desc ) {
641+ dev_warn (& hbus -> hdev -> device , "%s() can not unmask irq %u\n" ,
642+ __func__ , data -> irq );
643+ return ;
644+ }
646645
647646 spin_lock_irqsave (& hbus -> retarget_msi_interrupt_lock , flags );
648647
@@ -1911,12 +1910,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
19111910 hv_pci_onchannelcallback (hbus );
19121911 spin_unlock_irqrestore (& channel -> sched_lock , flags );
19131912
1914- if (hpdev -> state == hv_pcichild_ejecting ) {
1915- dev_err_once (& hbus -> hdev -> device ,
1916- "the device is being ejected\n" );
1917- goto enable_tasklet ;
1918- }
1919-
19201913 udelay (100 );
19211914 }
19221915
@@ -2522,6 +2515,8 @@ static void pci_devices_present_work(struct work_struct *work)
25222515 if (!dr )
25232516 return ;
25242517
2518+ mutex_lock (& hbus -> state_lock );
2519+
25252520 /* First, mark all existing children as reported missing. */
25262521 spin_lock_irqsave (& hbus -> device_list_lock , flags );
25272522 list_for_each_entry (hpdev , & hbus -> children , list_entry ) {
@@ -2603,6 +2598,8 @@ static void pci_devices_present_work(struct work_struct *work)
26032598 break ;
26042599 }
26052600
2601+ mutex_unlock (& hbus -> state_lock );
2602+
26062603 kfree (dr );
26072604}
26082605
@@ -2751,7 +2748,7 @@ static void hv_eject_device_work(struct work_struct *work)
27512748 hpdev = container_of (work , struct hv_pci_dev , wrk );
27522749 hbus = hpdev -> hbus ;
27532750
2754- WARN_ON ( hpdev -> state != hv_pcichild_ejecting );
2751+ mutex_lock ( & hbus -> state_lock );
27552752
27562753 /*
27572754 * Ejection can come before or after the PCI bus has been set up, so
@@ -2789,6 +2786,8 @@ static void hv_eject_device_work(struct work_struct *work)
27892786 put_pcichild (hpdev );
27902787 put_pcichild (hpdev );
27912788 /* hpdev has been freed. Do not use it any more. */
2789+
2790+ mutex_unlock (& hbus -> state_lock );
27922791}
27932792
27942793/**
@@ -2809,7 +2808,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
28092808 return ;
28102809 }
28112810
2812- hpdev -> state = hv_pcichild_ejecting ;
28132811 get_pcichild (hpdev );
28142812 INIT_WORK (& hpdev -> wrk , hv_eject_device_work );
28152813 queue_work (hbus -> wq , & hpdev -> wrk );
@@ -3238,8 +3236,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
32383236 struct pci_bus_d0_entry * d0_entry ;
32393237 struct hv_pci_compl comp_pkt ;
32403238 struct pci_packet * pkt ;
3239+ bool retry = true;
32413240 int ret ;
32423241
3242+ enter_d0_retry :
32433243 /*
32443244 * Tell the host that the bus is ready to use, and moved into the
32453245 * powered-on state. This includes telling the host which region
@@ -3266,6 +3266,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
32663266 if (ret )
32673267 goto exit ;
32683268
3269+ /*
3270+ * In certain case (Kdump) the pci device of interest was
3271+ * not cleanly shut down and resource is still held on host
3272+ * side, the host could return invalid device status.
3273+ * We need to explicitly request host to release the resource
3274+ * and try to enter D0 again.
3275+ */
3276+ if (comp_pkt .completion_status < 0 && retry ) {
3277+ retry = false;
3278+
3279+ dev_err (& hdev -> device , "Retrying D0 Entry\n" );
3280+
3281+ /*
3282+ * Hv_pci_bus_exit() calls hv_send_resource_released()
3283+ * to free up resources of its child devices.
3284+ * In the kdump kernel we need to set the
3285+ * wslot_res_allocated to 255 so it scans all child
3286+ * devices to release resources allocated in the
3287+ * normal kernel before panic happened.
3288+ */
3289+ hbus -> wslot_res_allocated = 255 ;
3290+
3291+ ret = hv_pci_bus_exit (hdev , true);
3292+
3293+ if (ret == 0 ) {
3294+ kfree (pkt );
3295+ goto enter_d0_retry ;
3296+ }
3297+ dev_err (& hdev -> device ,
3298+ "Retrying D0 failed with ret %d\n" , ret );
3299+ }
3300+
32693301 if (comp_pkt .completion_status < 0 ) {
32703302 dev_err (& hdev -> device ,
32713303 "PCI Pass-through VSP failed D0 Entry with status %x\n" ,
@@ -3308,6 +3340,24 @@ static int hv_pci_query_relations(struct hv_device *hdev)
33083340 if (!ret )
33093341 ret = wait_for_response (hdev , & comp );
33103342
3343+ /*
3344+ * In the case of fast device addition/removal, it's possible that
3345+ * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
3346+ * already got a PCI_BUS_RELATIONS* message from the host and the
3347+ * channel callback already scheduled a work to hbus->wq, which can be
3348+ * running pci_devices_present_work() -> survey_child_resources() ->
3349+ * complete(&hbus->survey_event), even after hv_pci_query_relations()
3350+ * exits and the stack variable 'comp' is no longer valid; as a result,
3351+ * a hang or a page fault may happen when the complete() calls
3352+ * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
3353+ * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
3354+ * -ENODEV, there can't be any more work item scheduled to hbus->wq
3355+ * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
3356+ * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
3357+ * channel->rescind = true.
3358+ */
3359+ flush_workqueue (hbus -> wq );
3360+
33113361 return ret ;
33123362}
33133363
@@ -3493,7 +3543,6 @@ static int hv_pci_probe(struct hv_device *hdev,
34933543 struct hv_pcibus_device * hbus ;
34943544 u16 dom_req , dom ;
34953545 char * name ;
3496- bool enter_d0_retry = true;
34973546 int ret ;
34983547
34993548 /*
@@ -3529,6 +3578,7 @@ static int hv_pci_probe(struct hv_device *hdev,
35293578 return - ENOMEM ;
35303579
35313580 hbus -> bridge = bridge ;
3581+ mutex_init (& hbus -> state_lock );
35323582 hbus -> state = hv_pcibus_init ;
35333583 hbus -> wslot_res_allocated = -1 ;
35343584
@@ -3633,49 +3683,15 @@ static int hv_pci_probe(struct hv_device *hdev,
36333683 if (ret )
36343684 goto free_fwnode ;
36353685
3636- retry :
36373686 ret = hv_pci_query_relations (hdev );
36383687 if (ret )
36393688 goto free_irq_domain ;
36403689
3641- ret = hv_pci_enter_d0 (hdev );
3642- /*
3643- * In certain case (Kdump) the pci device of interest was
3644- * not cleanly shut down and resource is still held on host
3645- * side, the host could return invalid device status.
3646- * We need to explicitly request host to release the resource
3647- * and try to enter D0 again.
3648- * Since the hv_pci_bus_exit() call releases structures
3649- * of all its child devices, we need to start the retry from
3650- * hv_pci_query_relations() call, requesting host to send
3651- * the synchronous child device relations message before this
3652- * information is needed in hv_send_resources_allocated()
3653- * call later.
3654- */
3655- if (ret == - EPROTO && enter_d0_retry ) {
3656- enter_d0_retry = false;
3657-
3658- dev_err (& hdev -> device , "Retrying D0 Entry\n" );
3659-
3660- /*
3661- * Hv_pci_bus_exit() calls hv_send_resources_released()
3662- * to free up resources of its child devices.
3663- * In the kdump kernel we need to set the
3664- * wslot_res_allocated to 255 so it scans all child
3665- * devices to release resources allocated in the
3666- * normal kernel before panic happened.
3667- */
3668- hbus -> wslot_res_allocated = 255 ;
3669- ret = hv_pci_bus_exit (hdev , true);
3670-
3671- if (ret == 0 )
3672- goto retry ;
3690+ mutex_lock (& hbus -> state_lock );
36733691
3674- dev_err (& hdev -> device ,
3675- "Retrying D0 failed with ret %d\n" , ret );
3676- }
3692+ ret = hv_pci_enter_d0 (hdev );
36773693 if (ret )
3678- goto free_irq_domain ;
3694+ goto release_state_lock ;
36793695
36803696 ret = hv_pci_allocate_bridge_windows (hbus );
36813697 if (ret )
@@ -3693,12 +3709,15 @@ static int hv_pci_probe(struct hv_device *hdev,
36933709 if (ret )
36943710 goto free_windows ;
36953711
3712+ mutex_unlock (& hbus -> state_lock );
36963713 return 0 ;
36973714
36983715free_windows :
36993716 hv_pci_free_bridge_windows (hbus );
37003717exit_d0 :
37013718 (void ) hv_pci_bus_exit (hdev , true);
3719+ release_state_lock :
3720+ mutex_unlock (& hbus -> state_lock );
37023721free_irq_domain :
37033722 irq_domain_remove (hbus -> irq_domain );
37043723free_fwnode :
@@ -3948,20 +3967,26 @@ static int hv_pci_resume(struct hv_device *hdev)
39483967 if (ret )
39493968 goto out ;
39503969
3970+ mutex_lock (& hbus -> state_lock );
3971+
39513972 ret = hv_pci_enter_d0 (hdev );
39523973 if (ret )
3953- goto out ;
3974+ goto release_state_lock ;
39543975
39553976 ret = hv_send_resources_allocated (hdev );
39563977 if (ret )
3957- goto out ;
3978+ goto release_state_lock ;
39583979
39593980 prepopulate_bars (hbus );
39603981
39613982 hv_pci_restore_msi_state (hbus );
39623983
39633984 hbus -> state = hv_pcibus_installed ;
3985+ mutex_unlock (& hbus -> state_lock );
39643986 return 0 ;
3987+
3988+ release_state_lock :
3989+ mutex_unlock (& hbus -> state_lock );
39653990out :
39663991 vmbus_close (hdev -> channel );
39673992 return ret ;
0 commit comments