Skip to content

Commit 1e50915

Browse files
Robert Becker authored and NeilBrown committed
raid: improve MD/raid10 handling of correctable read errors.
We've noticed severe lasting performance degradation of our raid arrays when we have drives that yield large amounts of media errors. The raid10 module will queue each failed read for retry, and also will attempt call fix_read_error() to perform the read recovery. Read recovery is performed while the array is frozen, so repeated recovery attempts can degrade the performance of the array for extended periods of time. With this patch I propose adding a per md device max number of corrected read attempts. Each rdev will maintain a count of read correction attempts in the rdev->read_errors field (not used currently for raid10). When we enter fix_read_error() we'll check to see when the last read error occurred, and divide the read error count by 2 for every hour since the last read error. If at that point our read error count exceeds the read error threshold, we'll fail the raid device. In addition in this patch I add sysfs nodes (get/set) for the per md max_read_errors attribute, the rdev->read_errors attribute, and added some printk's to indicate when fix_read_error fails to repair an rdev. For testing I used debugfs->fail_make_request to inject IO errors to the rdev while doing IO to the raid array. Signed-off-by: Robert Becker <Rob.Becker@riverbed.com> Signed-off-by: NeilBrown <neilb@suse.de>
1 parent 67b8dc4 commit 1e50915

File tree

3 files changed

+112
-0
lines changed

3 files changed

+112
-0
lines changed

drivers/md/md.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
6767

6868
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
6969

70+
/*
 * Default limit on the number of read corrections attempted on an
 * rdev before it is ejected from the array.  The accumulated count
 * is halved for every hour that elapses between read errors (see
 * check_decay_read_errors() in raid10.c).
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
7076
/*
7177
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
7278
* is 1000 KB/sec, so the extra system load does not show up that much.
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
26532659
rdev->flags = 0;
26542660
rdev->data_offset = 0;
26552661
rdev->sb_events = 0;
2662+
rdev->last_read_error.tv_sec = 0;
2663+
rdev->last_read_error.tv_nsec = 0;
26562664
atomic_set(&rdev->nr_pending, 0);
26572665
atomic_set(&rdev->read_errors, 0);
26582666
atomic_set(&rdev->corrected_errors, 0);
@@ -3289,6 +3297,29 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
32893297
static struct md_sysfs_entry md_array_state =
32903298
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
32913299

3300+
static ssize_t
3301+
max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3302+
return sprintf(page, "%d\n",
3303+
atomic_read(&mddev->max_corr_read_errors));
3304+
}
3305+
3306+
static ssize_t
3307+
max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3308+
{
3309+
char *e;
3310+
unsigned long n = simple_strtoul(buf, &e, 10);
3311+
3312+
if (*buf && (*e == 0 || *e == '\n')) {
3313+
atomic_set(&mddev->max_corr_read_errors, n);
3314+
return len;
3315+
}
3316+
return -EINVAL;
3317+
}
3318+
3319+
static struct md_sysfs_entry max_corr_read_errors =
3320+
__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3321+
max_corrected_read_errors_store);
3322+
32923323
static ssize_t
32933324
null_show(mddev_t *mddev, char *page)
32943325
{
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
39143945
&md_array_state.attr,
39153946
&md_reshape_position.attr,
39163947
&md_array_size.attr,
3948+
&max_corr_read_errors.attr,
39173949
NULL,
39183950
};
39193951

@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
43334365
mddev->ro = 0;
43344366

43354367
atomic_set(&mddev->writes_pending,0);
4368+
atomic_set(&mddev->max_corr_read_errors,
4369+
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
43364370
mddev->safemode = 0;
43374371
mddev->safemode_timer.function = md_safemode_timeout;
43384372
mddev->safemode_timer.data = (unsigned long) mddev;

drivers/md/md.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ struct mdk_rdev_s
9797
atomic_t read_errors; /* number of consecutive read errors that
9898
* we have tried to ignore.
9999
*/
100+
struct timespec last_read_error; /* monotonic time since our
101+
* last read error
102+
*/
100103
atomic_t corrected_errors; /* number of corrected read errors,
101104
* for reporting to userspace and storing
102105
* in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
299302
int external;
300303
} bitmap_info;
301304

305+
atomic_t max_corr_read_errors; /* max read retries */
302306
struct list_head all_mddevs;
303307

304308
/* Generic barrier handling.

drivers/md/raid10.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
14311431
}
14321432

14331433

1434+
/*
1435+
* Used by fix_read_error() to decay the per rdev read_errors.
1436+
* We halve the read error count for every hour that has elapsed
1437+
* since the last recorded read error.
1438+
*
1439+
*/
1440+
static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1441+
{
1442+
struct timespec cur_time_mon;
1443+
unsigned long hours_since_last;
1444+
unsigned int read_errors = atomic_read(&rdev->read_errors);
1445+
1446+
ktime_get_ts(&cur_time_mon);
1447+
1448+
if (rdev->last_read_error.tv_sec == 0 &&
1449+
rdev->last_read_error.tv_nsec == 0) {
1450+
/* first time we've seen a read error */
1451+
rdev->last_read_error = cur_time_mon;
1452+
return;
1453+
}
1454+
1455+
hours_since_last = (cur_time_mon.tv_sec -
1456+
rdev->last_read_error.tv_sec) / 3600;
1457+
1458+
rdev->last_read_error = cur_time_mon;
1459+
1460+
/*
1461+
* if hours_since_last is > the number of bits in read_errors
1462+
* just set read errors to 0. We do this to avoid
1463+
* overflowing the shift of read_errors by hours_since_last.
1464+
*/
1465+
if (hours_since_last >= 8 * sizeof(read_errors))
1466+
atomic_set(&rdev->read_errors, 0);
1467+
else
1468+
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1469+
}
1470+
14341471
/*
14351472
* This is a kernel thread which:
14361473
*
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
14441481
int sect = 0; /* Offset from r10_bio->sector */
14451482
int sectors = r10_bio->sectors;
14461483
mdk_rdev_t*rdev;
1484+
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1485+
1486+
rcu_read_lock();
1487+
{
1488+
int d = r10_bio->devs[r10_bio->read_slot].devnum;
1489+
char b[BDEVNAME_SIZE];
1490+
int cur_read_error_count = 0;
1491+
1492+
rdev = rcu_dereference(conf->mirrors[d].rdev);
1493+
bdevname(rdev->bdev, b);
1494+
1495+
if (test_bit(Faulty, &rdev->flags)) {
1496+
rcu_read_unlock();
1497+
/* drive has already been failed, just ignore any
1498+
more fix_read_error() attempts */
1499+
return;
1500+
}
1501+
1502+
check_decay_read_errors(mddev, rdev);
1503+
atomic_inc(&rdev->read_errors);
1504+
cur_read_error_count = atomic_read(&rdev->read_errors);
1505+
if (cur_read_error_count > max_read_errors) {
1506+
rcu_read_unlock();
1507+
printk(KERN_NOTICE
1508+
"raid10: %s: Raid device exceeded "
1509+
"read_error threshold "
1510+
"[cur %d:max %d]\n",
1511+
b, cur_read_error_count, max_read_errors);
1512+
printk(KERN_NOTICE
1513+
"raid10: %s: Failing raid "
1514+
"device\n", b);
1515+
md_error(mddev, conf->mirrors[d].rdev);
1516+
return;
1517+
}
1518+
}
1519+
rcu_read_unlock();
1520+
14471521
while(sectors) {
14481522
int s = sectors;
14491523
int sl = r10_bio->read_slot;

0 commit comments

Comments
 (0)