  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include "dm-bio-list.h"
 #include <linux/raid/raid10.h>
+#include <linux/raid/bitmap.h>
 
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R10BIO_Degraded, &r10_bio->state);
+	} else
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(r10_bio->mddev);
 		raid_end_bio_io(r10_bio);
 	}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	rcu_read_lock();
 	/*
 	 * Check if we can balance. We can balance on the whole
-	 * device if no resync is going on, or below the resync window.
-	 * We take the first readable disk when above the resync window.
+	 * device if no resync is going on (recovery is ok), or below
+	 * the resync window. We take the first readable disk when
+	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
 	    && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
591601
592602static void raid10_unplug (request_queue_t * q )
593603{
604+ mddev_t * mddev = q -> queuedata ;
605+
594606 unplug_slaves (q -> queuedata );
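+	/* wake raid10d so it can flush bitmap updates and submit queued writes */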
+	md_wakeup_thread(mddev->thread);
 }
 
 static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
  */
 #define RESYNC_DEPTH 32
 
-static void raise_barrier(conf_t *conf)
+static void raise_barrier(conf_t *conf, int force)
 {
+	BUG_ON(force && !conf->barrier);
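+	/* 'force' may only be used while a barrier is already held; normal IO
+	 * is then blocked, so waiting for nr_waiting to drain would deadlock */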
 	spin_lock_irq(&conf->resync_lock);
 
-	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	/* Wait until no block IO is waiting (unless 'force') */
+	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));
 
@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
+	struct bio_list bl;
+	unsigned long flags;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
+	r10_bio->state = 0;
 
 	if (rw == READ) {
 		/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			r10_bio->devs[i].bio = bio;
-		} else
+		} else {
 			r10_bio->devs[i].bio = NULL;
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+		}
 	}
 	rcu_read_unlock();
 
-	atomic_set(&r10_bio->remaining, 1);
+	atomic_set(&r10_bio->remaining, 0);
 
+	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		generic_make_request(mbio);
+		bio_list_add(&bl, mbio);
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
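+	/* record the write intent in the bitmap, then queue the bios for
+	 * raid10d to submit once the bitmap update is safely on disk */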
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	bio_list_merge(&conf->pending_bio_list, &bl);
+	blk_plug_device(mddev->queue);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	return 0;
 }
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return 0;
 
-	for (mirror = 0; mirror < mddev->raid_disks; mirror++)
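+	/* prefer the slot this device occupied before it was removed, so
+	 * bitmap-based partial recovery remains possible */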
+	if (rdev->saved_raid_disk >= 0 &&
+	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+		mirror = rdev->saved_raid_disk;
+	else
+		mirror = 0;
+	for ( ; mirror < mddev->raid_disks; mirror++)
 		if ( !(p = conf->mirrors + mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
 			found = 1;
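+			/* not returning to its former slot: any bitmap history
+			 * is unusable, so force a full resync */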
+			if (rdev->saved_raid_disk != mirror)
+				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 		spin_lock_irqsave(&conf->device_lock, flags);
+
+		if (conf->pending_bio_list.head) {
+			bio = bio_list_get(&conf->pending_bio_list);
+			blk_remove_plug(mddev->queue);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+			if (bitmap_unplug(mddev->bitmap) != 0)
+				printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+			while (bio) { /* submit pending writes */
+				struct bio *next = bio->bi_next;
+				bio->bi_next = NULL;
+				generic_make_request(bio);
+				bio = next;
+			}
+			unplug = 1;
+
+			continue;
+		}
+
 		if (list_empty(head))
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int max_sync;
+	int sync_blocks;
 
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		/* If we aborted, we need to abort the
+		 * sync on the 'current' bitmap chunks (there can
+		 * be several when recovering multiple devices),
+		 * as we may have started syncing them but not finished.
+		 * We can find the current address in
+		 * mddev->curr_resync, but for recovery,
+		 * we need to convert that to several
+		 * virtual addresses.
+		 */
+		if (mddev->curr_resync < max_sector) { /* aborted */
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+						&sync_blocks, 1);
+			else for (i = 0; i < conf->raid_disks; i++) {
+				sector_t sect =
+					raid10_find_virt(conf, mddev->curr_resync, i);
+				bitmap_end_sync(mddev->bitmap, sect,
+						&sync_blocks, 1);
+			}
+		} else /* completed sync */
+			conf->fullsync = 0;
+
+		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
 		return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	 */
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	raise_barrier(conf);
-	conf->next_resync = sector_nr;
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	 * end_sync_write if we will want to write.
 	 */
 
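+	/* limit each pass to at most one resync buffer's worth of sectors */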
+	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int i, j, k;
@@ -1451,22 +1523,51 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	for (i = 0; i < conf->raid_disks; i++)
 		if (conf->mirrors[i].rdev &&
 		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
+			int still_degraded = 0;
 			/* want to reconstruct this device */
 			r10bio_t *rb2 = r10_bio;
+			sector_t sect = raid10_find_virt(conf, sector_nr, i);
+			int must_sync;
+			/* Unless we are doing a full sync, we only need
+			 * to recover the block if it is set in the bitmap
+			 */
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, 1);
+			if (sync_blocks < max_sync)
+				max_sync = sync_blocks;
+			if (!must_sync &&
+			    !conf->fullsync) {
+				/* yep, skip the sync_blocks here, but don't assume
+				 * that there will never be anything to do here
+				 */
+				chunks_skipped = -1;
+				continue;
+			}
 
 			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-			spin_lock_irq(&conf->resync_lock);
-			if (rb2) conf->barrier++;
-			spin_unlock_irq(&conf->resync_lock);
+			raise_barrier(conf, rb2 != NULL);
 			atomic_set(&r10_bio->remaining, 0);
 
 			r10_bio->master_bio = (struct bio *)rb2;
 			if (rb2)
 				atomic_inc(&rb2->remaining);
 			r10_bio->mddev = mddev;
 			set_bit(R10BIO_IsRecover, &r10_bio->state);
-			r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
+			r10_bio->sector = sect;
+
 			raid10_find_phys(conf, r10_bio);
+			/* Need to check if this section will still be
+			 * degraded
+			 */
+			for (j = 0; j < conf->copies; j++) {
+				int d = r10_bio->devs[j].devnum;
+				if (conf->mirrors[d].rdev == NULL ||
+				    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+					still_degraded = 1;
+			}
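+			/* if the section stays degraded, the bitmap must keep
+			 * these bits set after the sync completes */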
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, still_degraded);
+
 			for (j = 0; j < conf->copies; j++) {
 				int d = r10_bio->devs[j].devnum;
 				if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	} else {
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
+
+		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+				       &sync_blocks, mddev->degraded) &&
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/* We can skip this block */
+			*skipped = 1;
+			return sync_blocks + sectors_skipped;
+		}
+		if (sync_blocks < max_sync)
+			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
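+		/* raise the barrier only once we know this block really
+		 * needs syncing */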
+		raise_barrier(conf, 0);
+		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
 		r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	}
 
 	nr_sectors = 0;
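+	/* don't build bios past the max_sync limit chosen above */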
+	if (sector_nr + max_sync < max_sector)
+		max_sector = sector_nr + max_sync;
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
 	return 0;
 }
 
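+/* quiesce callback: state 1 raises the resync barrier to block all normal
+ * IO, state 0 releases it again; the md thread timeout is retuned so the
+ * bitmap daemon runs regularly while a bitmap is present.
+ */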
+static void raid10_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		raise_barrier(conf, 0);
+		break;
+	case 0:
+		lower_barrier(conf);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
 
 static mdk_personality_t raid10_personality =
 {
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
 	.hot_remove_disk = raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
+	.quiesce	= raid10_quiesce,
 };
 
 static int __init raid_init(void)