Skip to content

Commit

Permalink
nvme: Fix locking protocol violation to fix suspend / resume
Browse files Browse the repository at this point in the history
Currently, when we suspend, we need to tear down all the qpairs. We call
nvme_admin_qpair_abort_aers with the admin qpair lock held, but the
tracker it will call for the pending AER also locks it (recursively)
hitting an assert. This routine is called without the qpair lock held
when we destroy the device entirely in a number of places. Add an assert
to this effect and drop the qpair lock before calling it.
nvme_admin_qpair_abort_aers then locks the qpair lock to traverse the
list, dropping it around calls to nvme_qpair_complete_tracker, and
restarting the list scan after picking it back up.

Note: If interrupts are still running, there's a tiny window for these
AERs: If one fires just an instant after we manually complete it, then
we'll be fine: we set the state of the queue to 'waiting' and we ignore
interrupts while 'waiting'. We know we'll destroy all the queue state
with these pending interrupts before looking at them again and we know
all the TRs will have been completed or rescheduled. So either way we're
covered.

Also, tidy up the failure case as well: failing a queue is a superset of
disabling it, so no need to call disable first. This solves solves some
locking issues with recursion since we don't need to recurse.. Set the
qpair state of failed queues to RECOVERY_FAILED and stop scheduling the
watchdog. Assert we're not failed when we're enabling a qpair, since
failure currently is one-way. Make failure a little less verbose.

Next, kill the pre/post reset stuff. It's completely bogus since we
disable the qparis, we don't need to also hold the lock through the
reset: disabling will cause the ISR to return early. This keeps us from
recursing on the recovery lock when resuming. We only need the recovery
lock to avoid a specific race between the timer and the ISR.

Finally, kill NVME_RESET_2X. It'S been a major release since we put it
in and nobody has used it as far as I can tell. And it was a motivator
for the pre/post uglification.

These are all interrelated, so need to be done at the same time.

Sponsored by:		Netflix
Reviewed by:		jhb
Tested by:		jhb (made sure suspend / resume worked)
MFC After:		3 days
Differential Revision:	https://reviews.freebsd.org/D41866

(cherry picked from commit da8324a)
(cherry picked from commit 81b118e)

Approved-by: re (cperciva)
  • Loading branch information
bsdimp committed Sep 28, 2023
1 parent b2bcb31 commit 001c45a
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 59 deletions.
49 changes: 5 additions & 44 deletions sys/dev/nvme/nvme_ctrlr.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,15 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr)
{
int i;

/*
* No need to disable queues before failing them. Failing is a superet
* of disabling (though pedantically we'd abort the AERs silently with
* a different error, though when we fail, that hardly matters).
*/
ctrlr->is_failed = true;
nvme_admin_qpair_disable(&ctrlr->adminq);
nvme_qpair_fail(&ctrlr->adminq);
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++) {
nvme_io_qpair_disable(&ctrlr->ioq[i]);
nvme_qpair_fail(&ctrlr->ioq[i]);
}
}
Expand Down Expand Up @@ -416,34 +419,6 @@ nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
}
}

static void
nvme_pre_reset(struct nvme_controller *ctrlr)
{
/*
* Make sure that all the ISRs are done before proceeding with the reset
* (and also keep any stray interrupts that happen during this process
* from racing this process). For startup, this is a nop, since the
* hardware is in a good state. But for recovery, where we randomly
* reset the hardware, this ensure that we're not racing the ISRs.
*/
mtx_lock(&ctrlr->adminq.recovery);
for (int i = 0; i < ctrlr->num_io_queues; i++) {
mtx_lock(&ctrlr->ioq[i].recovery);
}
}

static void
nvme_post_reset(struct nvme_controller *ctrlr)
{
/*
* Reset complete, unblock ISRs
*/
mtx_unlock(&ctrlr->adminq.recovery);
for (int i = 0; i < ctrlr->num_io_queues; i++) {
mtx_unlock(&ctrlr->ioq[i].recovery);
}
}

static int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
Expand Down Expand Up @@ -1236,9 +1211,7 @@ nvme_ctrlr_reset_task(void *arg, int pending)
int status;

nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
nvme_pre_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
nvme_post_reset(ctrlr);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
Expand Down Expand Up @@ -1725,19 +1698,8 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
if (ctrlr->is_failed)
return (0);

nvme_pre_reset(ctrlr);
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#ifdef NVME_2X_RESET
/*
* Prior to FreeBSD 13.1, FreeBSD's nvme driver reset the hardware twice
* to get it into a known good state. However, the hardware's state is
* good and we don't need to do this for proper functioning.
*/
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#endif
nvme_post_reset(ctrlr);

/*
* Now that we've reset the hardware, we can restart the controller. Any
Expand All @@ -1754,7 +1716,6 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
nvme_post_reset(ctrlr);
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
Expand Down
1 change: 1 addition & 0 deletions sys/dev/nvme/nvme_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ struct nvme_tracker {
enum nvme_recovery {
RECOVERY_NONE = 0, /* Normal operations */
RECOVERY_WAITING, /* waiting for the reset to complete */
RECOVERY_FAILED, /* We have failed, no more I/O */
};
struct nvme_qpair {
struct nvme_controller *ctrlr;
Expand Down
51 changes: 36 additions & 15 deletions sys/dev/nvme/nvme_qpair.c
Original file line number Diff line number Diff line change
Expand Up @@ -945,22 +945,38 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;

/*
* nvme_complete_tracker must be called without the qpair lock held. It
* takes the lock to adjust outstanding_tr list, so make sure we don't
* have it yet (since this is a general purpose routine). We take the
* lock to make the list traverse safe, but have to drop the lock to
* complete any AER. We restart the list scan when we do this to make
* this safe. There's interlock with the ISR so we know this tracker
* won't be completed twice.
*/
mtx_assert(&qpair->lock, MA_NOTOWNED);

mtx_lock(&qpair->lock);
tr = TAILQ_FIRST(&qpair->outstanding_tr);
while (tr != NULL) {
if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_tracker(tr,
NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
ERROR_PRINT_NONE);
mtx_lock(&qpair->lock);
tr = TAILQ_FIRST(&qpair->outstanding_tr);
} else {
tr = TAILQ_NEXT(tr, tailq);
}
}
mtx_unlock(&qpair->lock);
}

void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
mtx_assert(&qpair->lock, MA_NOTOWNED);

nvme_admin_qpair_abort_aers(qpair);
nvme_qpair_destroy(qpair);
Expand Down Expand Up @@ -1011,17 +1027,6 @@ nvme_qpair_timeout(void *arg)

mtx_assert(&qpair->recovery, MA_OWNED);

/*
* If the controller has failed, give up. We're never going to change
* state from a failed controller: no further transactions are possible.
* We go ahead and let the timeout expire in many cases for simplicity.
*/
if (qpair->ctrlr->is_failed) {
nvme_printf(ctrlr, "Controller failed, giving up\n");
qpair->timer_armed = false;
return;
}

switch (qpair->recovery_state) {
case RECOVERY_NONE:
/*
Expand Down Expand Up @@ -1108,6 +1113,11 @@ nvme_qpair_timeout(void *arg)
nvme_printf(ctrlr, "Waiting for reset to complete\n");
idle = false; /* We want to keep polling */
break;
case RECOVERY_FAILED:
KASSERT(qpair->ctrlr->is_failed,
("Recovery state failed w/o failed controller\n"));
idle = true; /* nothing to monitor */
break;
}

/*
Expand Down Expand Up @@ -1297,6 +1307,8 @@ nvme_qpair_enable(struct nvme_qpair *qpair)
mtx_assert(&qpair->recovery, MA_OWNED);
if (mtx_initialized(&qpair->lock))
mtx_assert(&qpair->lock, MA_OWNED);
KASSERT(qpair->recovery_state != RECOVERY_FAILED,
("Enabling a failed qpair\n"));

qpair->recovery_state = RECOVERY_NONE;
}
Expand Down Expand Up @@ -1421,12 +1433,13 @@ void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);

mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
mtx_unlock(&qpair->lock);

nvme_admin_qpair_abort_aers(qpair);

mtx_unlock(&qpair->lock);
mtx_unlock(&qpair->recovery);
}

Expand All @@ -1451,26 +1464,34 @@ nvme_qpair_fail(struct nvme_qpair *qpair)
if (!mtx_initialized(&qpair->lock))
return;

mtx_lock(&qpair->recovery);
qpair->recovery_state = RECOVERY_FAILED;
mtx_unlock(&qpair->recovery);

mtx_lock(&qpair->lock);

if (!STAILQ_EMPTY(&qpair->queued_req)) {
nvme_printf(qpair->ctrlr, "failing queued i/o\n");
}
while (!STAILQ_EMPTY(&qpair->queued_req)) {
req = STAILQ_FIRST(&qpair->queued_req);
STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
nvme_printf(qpair->ctrlr, "failing queued i/o\n");
mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
NVME_SC_ABORTED_BY_REQUEST);
mtx_lock(&qpair->lock);
}

if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
}
/* Manually abort each outstanding I/O. */
while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
tr = TAILQ_FIRST(&qpair->outstanding_tr);
/*
* Do not remove the tracker. The abort_tracker path will
* do that for us.
*/
nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
Expand Down

0 comments on commit 001c45a

Please sign in to comment.