Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
In WriteThru and WriteAround modes, fall back to disk on SSD errors.
Browse files Browse the repository at this point in the history
Summary: Committing an old patch to handle SSD errors in writethru and writearound
modes to fall back to disk instead of returning errors back to the user. This path
needs to be extended for writeback as well (failed reads on a !DIRTY block and failed
new writes can be similarly handled). Thanks to Arne Wiebalck for providing the impetus
for this change, reworking my initial attempt and testing the patch in production.

Test Plan:

Reviewers:

CC:

Task ID: #

Blame Rev:
  • Loading branch information
Mohan Srinivasan committed Oct 25, 2012
1 parent e0724aa commit 6aaf44a
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 6 deletions.
7 changes: 7 additions & 0 deletions src/flashcache.h
Expand Up @@ -270,6 +270,13 @@ struct cache_c {
char cache_devname[DEV_PATHLEN];
char disk_devname[DEV_PATHLEN];

/*
* If the SSD returns errors, in WRITETHRU and WRITEAROUND modes,
* bypass the cache completely. If the SSD dies or is removed,
* we want to continue sending requests to the device.
*/
int bypass_cache;

/* Per device sysctls */
int sysctl_io_latency_hist;
int sysctl_do_sync;
Expand Down
6 changes: 5 additions & 1 deletion src/flashcache_ioctl.c
Expand Up @@ -311,7 +311,11 @@ int
flashcache_uncacheable(struct cache_c *dmc, struct bio *bio)
{
int dontcache;


if (unlikely(dmc->bypass_cache)) {
dontcache = 1;
goto out;
}
if (dmc->sysctl_cache_all) {
/* If the tid has been blacklisted, we don't cache at all.
This overrides everything else */
Expand Down
65 changes: 60 additions & 5 deletions src/flashcache_main.c
Expand Up @@ -188,6 +188,11 @@ flashcache_io_callback(unsigned long error, void *context)
error = -EIO;
DMERR("flashcache_io_callback: io error %ld block %lu action %d",
error, job->job_io_regions.disk.sector, job->action);
if (!dmc->bypass_cache && dmc->cache_mode != FLASHCACHE_WRITE_BACK) {
DMERR("flashcache_io_callback: switching %s to BYPASS mode",
dmc->cache_devname);
dmc->bypass_cache = 1;
}
}
job->error = error;
switch (job->action) {
Expand Down Expand Up @@ -295,7 +300,19 @@ flashcache_io_callback(unsigned long error, void *context)
}
break;
}
flashcache_bio_endio(bio, error, dmc, &job->io_start_time);
/*
* If we get an error in write through || write around modes,
* we try the disk directly, after invalidating the cached block.
* see flashcache_do_pending_error().
* XXX - We can do the same for writeback as well. But that is more
* work. (a) we cannot fall back to disk when a ssd read of a dirty
* cacheblock fails (b) we'd need to handle ssd metadata write
* failures as well and fall back to disk in those cases as well.
*/
if (likely(error == 0) || (dmc->cache_mode == FLASHCACHE_WRITE_BACK)) {
flashcache_bio_endio(bio, error, dmc, &job->io_start_time);
job->bio = NULL;
}
/*
* The INPROG flag is still set. We cannot turn that off until all the pending requests
* processed. We need to loop the pending requests back to a workqueue. We have the job,
Expand Down Expand Up @@ -346,9 +363,14 @@ flashcache_do_pending_error(struct kcached_job *job)
struct cache_c *dmc = job->dmc;
unsigned long flags;
struct cacheblock *cacheblk = &dmc->cache[job->index];
struct bio *bio = job->bio;
int error = job->error;
struct pending_job *pjob_list = NULL, *pjob = NULL;

DMERR("flashcache_do_pending_error: error %d block %lu action %d",
job->error, job->job_io_regions.disk.sector, job->action);
if (!dmc->bypass_cache) {
DMERR("flashcache_do_pending_error: error %d block %lu action %d",
job->error, job->job_io_regions.disk.sector, job->action);
}
spin_lock_irqsave(&dmc->cache_spin_lock, flags);
VERIFY(cacheblk->cache_state & VALID);
/* Invalidate block if possible */
Expand All @@ -357,10 +379,43 @@ flashcache_do_pending_error(struct kcached_job *job)
dmc->flashcache_stats.pending_inval++;
cacheblk->cache_state &= ~VALID;
cacheblk->cache_state |= INVALID;
}
flashcache_free_pending_jobs(dmc, cacheblk, job->error);
} else
VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_BACK);
cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
/*
* In case of an error in writethru or writearound modes, if there
* are pending jobs, de-link them from the cacheblock so we can issue disk
* IOs below.
*/
if (bio != NULL) {
VERIFY(dmc->cache_mode != FLASHCACHE_WRITE_BACK);
pjob_list = flashcache_deq_pending(dmc, cacheblk - &dmc->cache[0]);
for (pjob = pjob_list ; pjob != NULL ; pjob = pjob->next) {
VERIFY(cacheblk->nr_queued > 0);
cacheblk->nr_queued--;
}
VERIFY(cacheblk->nr_queued == 0);
} else
flashcache_free_pending_jobs(dmc, cacheblk, job->error);
spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
if (bio != NULL) {
/*
* Cache (read/write) error in write through or write around
* mode. Issue the IO directly to disk. We've already invalidated
* the cache block above.
*/
if (!dmc->bypass_cache) /* suppress massive console output */
DMERR("flashcache_do_pending_error: Re-launching errored IO"
"to disk, after io error %d block %lu",
error, bio->bi_sector);
flashcache_start_uncached_io(dmc, bio);
while (pjob_list != NULL) {
pjob = pjob_list;
pjob_list = pjob->next;
flashcache_start_uncached_io(dmc, pjob->bio);
flashcache_free_pending_job(pjob);
}
}
flashcache_free_cache_job(job);
if (atomic_dec_and_test(&dmc->nr_jobs))
wake_up(&dmc->destroyq);
Expand Down

0 comments on commit 6aaf44a

Please sign in to comment.