Skip to content

Commit 6cce3b2

Browse files
neilbrown authored and Linus Torvalds committed
[PATCH] md: write intent bitmap support for raid10
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent b15c2e5 commit 6cce3b2

File tree

3 files changed

+171
-26
lines changed

3 files changed

+171
-26
lines changed

drivers/md/md.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
714714

715715
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
716716
mddev->bitmap_file == NULL) {
717-
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
717+
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
718+
&& mddev->level != 10) {
718719
/* FIXME use a better test */
719-
printk(KERN_WARNING "md: bitmaps only support for raid1\n");
720+
printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
720721
return -EINVAL;
721722
}
722723
mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
10371038

10381039
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
10391040
mddev->bitmap_file == NULL ) {
1040-
if (mddev->level != 1) {
1041-
printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
1041+
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1042+
&& mddev->level != 10) {
1043+
printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
10421044
return -EINVAL;
10431045
}
10441046
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

drivers/md/raid10.c

Lines changed: 157 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1919
*/
2020

21+
#include "dm-bio-list.h"
2122
#include <linux/raid/raid10.h>
23+
#include <linux/raid/bitmap.h>
2224

2325
/*
2426
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
306308
/*
307309
* this branch is our 'one mirror IO has finished' event handler:
308310
*/
309-
if (!uptodate)
311+
if (!uptodate) {
310312
md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
311-
else
313+
/* an I/O failed, we can't clear the bitmap */
314+
set_bit(R10BIO_Degraded, &r10_bio->state);
315+
} else
312316
/*
313317
* Set R10BIO_Uptodate in our master bio, so that
314318
* we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
328332
* already.
329333
*/
330334
if (atomic_dec_and_test(&r10_bio->remaining)) {
335+
/* clear the bitmap if all writes complete successfully */
336+
bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
337+
r10_bio->sectors,
338+
!test_bit(R10BIO_Degraded, &r10_bio->state),
339+
0);
331340
md_write_end(r10_bio->mddev);
332341
raid_end_bio_io(r10_bio);
333342
}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
486495
rcu_read_lock();
487496
/*
488497
* Check if we can balance. We can balance on the whole
489-
* device if no resync is going on, or below the resync window.
490-
* We take the first readable disk when above the resync window.
498+
* device if no resync is going on (recovery is ok), or below
499+
* the resync window. We take the first readable disk when
500+
* above the resync window.
491501
*/
492502
if (conf->mddev->recovery_cp < MaxSector
493503
&& (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
591601

592602
static void raid10_unplug(request_queue_t *q)
593603
{
604+
mddev_t *mddev = q->queuedata;
605+
594606
unplug_slaves(q->queuedata);
607+
md_wakeup_thread(mddev->thread);
595608
}
596609

597610
static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
647660
*/
648661
#define RESYNC_DEPTH 32
649662

650-
static void raise_barrier(conf_t *conf)
663+
static void raise_barrier(conf_t *conf, int force)
651664
{
665+
BUG_ON(force && !conf->barrier);
652666
spin_lock_irq(&conf->resync_lock);
653667

654-
/* Wait until no block IO is waiting */
655-
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
668+
/* Wait until no block IO is waiting (unless 'force') */
669+
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
656670
conf->resync_lock,
657671
raid10_unplug(conf->mddev->queue));
658672

@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
710724
int i;
711725
int chunk_sects = conf->chunk_mask + 1;
712726
const int rw = bio_data_dir(bio);
727+
struct bio_list bl;
728+
unsigned long flags;
713729

714730
if (unlikely(bio_barrier(bio))) {
715731
bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
767783

768784
r10_bio->mddev = mddev;
769785
r10_bio->sector = bio->bi_sector;
786+
r10_bio->state = 0;
770787

771788
if (rw == READ) {
772789
/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
811828
!test_bit(Faulty, &rdev->flags)) {
812829
atomic_inc(&rdev->nr_pending);
813830
r10_bio->devs[i].bio = bio;
814-
} else
831+
} else {
815832
r10_bio->devs[i].bio = NULL;
833+
set_bit(R10BIO_Degraded, &r10_bio->state);
834+
}
816835
}
817836
rcu_read_unlock();
818837

819-
atomic_set(&r10_bio->remaining, 1);
838+
atomic_set(&r10_bio->remaining, 0);
820839

840+
bio_list_init(&bl);
821841
for (i = 0; i < conf->copies; i++) {
822842
struct bio *mbio;
823843
int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
835855
mbio->bi_private = r10_bio;
836856

837857
atomic_inc(&r10_bio->remaining);
838-
generic_make_request(mbio);
858+
bio_list_add(&bl, mbio);
839859
}
840860

841-
if (atomic_dec_and_test(&r10_bio->remaining)) {
842-
md_write_end(mddev);
843-
raid_end_bio_io(r10_bio);
844-
}
861+
bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
862+
spin_lock_irqsave(&conf->device_lock, flags);
863+
bio_list_merge(&conf->pending_bio_list, &bl);
864+
blk_plug_device(mddev->queue);
865+
spin_unlock_irqrestore(&conf->device_lock, flags);
845866

846867
return 0;
847868
}
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
9991020
if (!enough(conf))
10001021
return 0;
10011022

1002-
for (mirror=0; mirror < mddev->raid_disks; mirror++)
1023+
if (rdev->saved_raid_disk >= 0 &&
1024+
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1025+
mirror = rdev->saved_raid_disk;
1026+
else
1027+
mirror = 0;
1028+
for ( ; mirror < mddev->raid_disks; mirror++)
10031029
if ( !(p=conf->mirrors+mirror)->rdev) {
10041030

10051031
blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
10151041
p->head_position = 0;
10161042
rdev->raid_disk = mirror;
10171043
found = 1;
1044+
if (rdev->saved_raid_disk != mirror)
1045+
conf->fullsync = 1;
10181046
rcu_assign_pointer(p->rdev, rdev);
10191047
break;
10201048
}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
12821310
for (;;) {
12831311
char b[BDEVNAME_SIZE];
12841312
spin_lock_irqsave(&conf->device_lock, flags);
1313+
1314+
if (conf->pending_bio_list.head) {
1315+
bio = bio_list_get(&conf->pending_bio_list);
1316+
blk_remove_plug(mddev->queue);
1317+
spin_unlock_irqrestore(&conf->device_lock, flags);
1318+
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
1319+
if (bitmap_unplug(mddev->bitmap) != 0)
1320+
printk("%s: bitmap file write failed!\n", mdname(mddev));
1321+
1322+
while (bio) { /* submit pending writes */
1323+
struct bio *next = bio->bi_next;
1324+
bio->bi_next = NULL;
1325+
generic_make_request(bio);
1326+
bio = next;
1327+
}
1328+
unplug = 1;
1329+
1330+
continue;
1331+
}
1332+
12851333
if (list_empty(head))
12861334
break;
12871335
r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
13881436
sector_t max_sector, nr_sectors;
13891437
int disk;
13901438
int i;
1439+
int max_sync;
1440+
int sync_blocks;
13911441

13921442
sector_t sectors_skipped = 0;
13931443
int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
14011451
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
14021452
max_sector = mddev->resync_max_sectors;
14031453
if (sector_nr >= max_sector) {
1454+
/* If we aborted, we need to abort the
1455+
* sync on the 'current' bitmap chucks (there can
1456+
* be several when recovering multiple devices).
1457+
* as we may have started syncing it but not finished.
1458+
* We can find the current address in
1459+
* mddev->curr_resync, but for recovery,
1460+
* we need to convert that to several
1461+
* virtual addresses.
1462+
*/
1463+
if (mddev->curr_resync < max_sector) { /* aborted */
1464+
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1465+
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1466+
&sync_blocks, 1);
1467+
else for (i=0; i<conf->raid_disks; i++) {
1468+
sector_t sect =
1469+
raid10_find_virt(conf, mddev->curr_resync, i);
1470+
bitmap_end_sync(mddev->bitmap, sect,
1471+
&sync_blocks, 1);
1472+
}
1473+
} else /* completed sync */
1474+
conf->fullsync = 0;
1475+
1476+
bitmap_close_sync(mddev->bitmap);
14041477
close_sync(conf);
14051478
*skipped = 1;
14061479
return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
14251498
*/
14261499
if (!go_faster && conf->nr_waiting)
14271500
msleep_interruptible(1000);
1428-
raise_barrier(conf);
1429-
conf->next_resync = sector_nr;
14301501

14311502
/* Again, very different code for resync and recovery.
14321503
* Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
14431514
* end_sync_write if we will want to write.
14441515
*/
14451516

1517+
max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
14461518
if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
14471519
/* recovery... the complicated one */
14481520
int i, j, k;
@@ -1451,22 +1523,51 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
14511523
for (i=0 ; i<conf->raid_disks; i++)
14521524
if (conf->mirrors[i].rdev &&
14531525
!test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1526+
int still_degraded = 0;
14541527
/* want to reconstruct this device */
14551528
r10bio_t *rb2 = r10_bio;
1529+
sector_t sect = raid10_find_virt(conf, sector_nr, i);
1530+
int must_sync;
1531+
/* Unless we are doing a full sync, we only need
1532+
* to recover the block if it is set in the bitmap
1533+
*/
1534+
must_sync = bitmap_start_sync(mddev->bitmap, sect,
1535+
&sync_blocks, 1);
1536+
if (sync_blocks < max_sync)
1537+
max_sync = sync_blocks;
1538+
if (!must_sync &&
1539+
!conf->fullsync) {
1540+
/* yep, skip the sync_blocks here, but don't assume
1541+
* that there will never be anything to do here
1542+
*/
1543+
chunks_skipped = -1;
1544+
continue;
1545+
}
14561546

14571547
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1458-
spin_lock_irq(&conf->resync_lock);
1459-
if (rb2) conf->barrier++;
1460-
spin_unlock_irq(&conf->resync_lock);
1548+
raise_barrier(conf, rb2 != NULL);
14611549
atomic_set(&r10_bio->remaining, 0);
14621550

14631551
r10_bio->master_bio = (struct bio*)rb2;
14641552
if (rb2)
14651553
atomic_inc(&rb2->remaining);
14661554
r10_bio->mddev = mddev;
14671555
set_bit(R10BIO_IsRecover, &r10_bio->state);
1468-
r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
1556+
r10_bio->sector = sect;
1557+
14691558
raid10_find_phys(conf, r10_bio);
1559+
/* Need to check if this section will still be
1560+
* degraded
1561+
*/
1562+
for (j=0; j<conf->copies;j++) {
1563+
int d = r10_bio->devs[j].devnum;
1564+
if (conf->mirrors[d].rdev == NULL ||
1565+
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1566+
still_degraded = 1;
1567+
}
1568+
must_sync = bitmap_start_sync(mddev->bitmap, sect,
1569+
&sync_blocks, still_degraded);
1570+
14701571
for (j=0; j<conf->copies;j++) {
14711572
int d = r10_bio->devs[j].devnum;
14721573
if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
15261627
} else {
15271628
/* resync. Schedule a read for every block at this virt offset */
15281629
int count = 0;
1630+
1631+
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1632+
&sync_blocks, mddev->degraded) &&
1633+
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1634+
/* We can skip this block */
1635+
*skipped = 1;
1636+
return sync_blocks + sectors_skipped;
1637+
}
1638+
if (sync_blocks < max_sync)
1639+
max_sync = sync_blocks;
15291640
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
15301641

15311642
r10_bio->mddev = mddev;
15321643
atomic_set(&r10_bio->remaining, 0);
1644+
raise_barrier(conf, 0);
1645+
conf->next_resync = sector_nr;
15331646

15341647
r10_bio->master_bio = NULL;
15351648
r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
15821695
}
15831696

15841697
nr_sectors = 0;
1698+
if (sector_nr + max_sync < max_sector)
1699+
max_sector = sector_nr + max_sync;
15851700
do {
15861701
struct page *page;
15871702
int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
18211936
return 0;
18221937
}
18231938

1939+
static void raid10_quiesce(mddev_t *mddev, int state)
1940+
{
1941+
conf_t *conf = mddev_to_conf(mddev);
1942+
1943+
switch(state) {
1944+
case 1:
1945+
raise_barrier(conf, 0);
1946+
break;
1947+
case 0:
1948+
lower_barrier(conf);
1949+
break;
1950+
}
1951+
if (mddev->thread) {
1952+
if (mddev->bitmap)
1953+
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1954+
else
1955+
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1956+
md_wakeup_thread(mddev->thread);
1957+
}
1958+
}
18241959

18251960
static mdk_personality_t raid10_personality =
18261961
{
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
18351970
.hot_remove_disk= raid10_remove_disk,
18361971
.spare_active = raid10_spare_active,
18371972
.sync_request = sync_request,
1973+
.quiesce = raid10_quiesce,
18381974
};
18391975

18401976
static int __init raid_init(void)

0 commit comments

Comments (0)