Skip to content

Commit 3bddb7f

Browse files
liu-song-6 authored and shli committed
md/r5cache: handle FLUSH and FUA
With raid5 cache, we commit data to the journal device. When there is a flush request, we need to flush the journal device's cache. This was not needed in raid5 journal-only mode, because there we flush the journal before committing data to the raid disks, so a completed write implies the journal cache is already flushed. FUA is handled similarly, except that for FUA we must also flush the journal: otherwise, corruption in earlier metadata could stop recovery from reaching the FUA data. The code was slightly changed by Shaohua. Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Shaohua Li <shli@fb.com>
1 parent 5aabf7c commit 3bddb7f

File tree

3 files changed

+157
-18
lines changed

3 files changed

+157
-18
lines changed

drivers/md/raid5-cache.c

Lines changed: 144 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ struct r5l_log {
159159

160160
spinlock_t stripe_in_journal_lock;
161161
atomic_t stripe_in_journal_count;
162+
163+
/* to submit async io_units, to fulfill ordering of flush */
164+
struct work_struct deferred_io_work;
162165
};
163166

164167
/*
@@ -185,6 +188,18 @@ struct r5l_io_unit {
185188

186189
int state;
187190
bool need_split_bio;
191+
struct bio *split_bio;
192+
193+
unsigned int has_flush:1; /* include flush request */
194+
unsigned int has_fua:1; /* include fua request */
195+
unsigned int has_null_flush:1; /* include empty flush request */
196+
/*
197+
* io isn't sent yet, flush/fua request can only be submitted till it's
198+
* the first IO in running_ios list
199+
*/
200+
unsigned int io_deferred:1;
201+
202+
struct bio_list flush_barriers; /* size == 0 flush bios */
188203
};
189204

190205
/* r5l_io_unit state */
@@ -494,9 +509,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
494509
}
495510
}
496511

512+
static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
497513
static void r5l_log_endio(struct bio *bio)
498514
{
499515
struct r5l_io_unit *io = bio->bi_private;
516+
struct r5l_io_unit *io_deferred;
500517
struct r5l_log *log = io->log;
501518
unsigned long flags;
502519

@@ -512,18 +529,89 @@ static void r5l_log_endio(struct bio *bio)
512529
r5l_move_to_end_ios(log);
513530
else
514531
r5l_log_run_stripes(log);
532+
if (!list_empty(&log->running_ios)) {
533+
/*
534+
* FLUSH/FUA io_unit is deferred because of ordering, now we
535+
* can dispatch it
536+
*/
537+
io_deferred = list_first_entry(&log->running_ios,
538+
struct r5l_io_unit, log_sibling);
539+
if (io_deferred->io_deferred)
540+
schedule_work(&log->deferred_io_work);
541+
}
542+
515543
spin_unlock_irqrestore(&log->io_list_lock, flags);
516544

517545
if (log->need_cache_flush)
518546
md_wakeup_thread(log->rdev->mddev->thread);
547+
548+
if (io->has_null_flush) {
549+
struct bio *bi;
550+
551+
WARN_ON(bio_list_empty(&io->flush_barriers));
552+
while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
553+
bio_endio(bi);
554+
atomic_dec(&io->pending_stripe);
555+
}
556+
if (atomic_read(&io->pending_stripe) == 0)
557+
__r5l_stripe_write_finished(io);
558+
}
559+
}
560+
561+
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
562+
{
563+
unsigned long flags;
564+
565+
spin_lock_irqsave(&log->io_list_lock, flags);
566+
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
567+
spin_unlock_irqrestore(&log->io_list_lock, flags);
568+
569+
if (io->has_flush)
570+
bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
571+
if (io->has_fua)
572+
bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
573+
submit_bio(io->current_bio);
574+
575+
if (!io->split_bio)
576+
return;
577+
578+
if (io->has_flush)
579+
bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
580+
if (io->has_fua)
581+
bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
582+
submit_bio(io->split_bio);
583+
}
584+
585+
/* deferred io_unit will be dispatched here */
586+
static void r5l_submit_io_async(struct work_struct *work)
587+
{
588+
struct r5l_log *log = container_of(work, struct r5l_log,
589+
deferred_io_work);
590+
struct r5l_io_unit *io = NULL;
591+
unsigned long flags;
592+
593+
spin_lock_irqsave(&log->io_list_lock, flags);
594+
if (!list_empty(&log->running_ios)) {
595+
io = list_first_entry(&log->running_ios, struct r5l_io_unit,
596+
log_sibling);
597+
if (!io->io_deferred)
598+
io = NULL;
599+
else
600+
io->io_deferred = 0;
601+
}
602+
spin_unlock_irqrestore(&log->io_list_lock, flags);
603+
if (io)
604+
r5l_do_submit_io(log, io);
519605
}
520606

521607
static void r5l_submit_current_io(struct r5l_log *log)
522608
{
523609
struct r5l_io_unit *io = log->current_io;
610+
struct bio *bio;
524611
struct r5l_meta_block *block;
525612
unsigned long flags;
526613
u32 crc;
614+
bool do_submit = true;
527615

528616
if (!io)
529617
return;
@@ -532,13 +620,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
532620
block->meta_size = cpu_to_le32(io->meta_offset);
533621
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
534622
block->checksum = cpu_to_le32(crc);
623+
bio = io->current_bio;
535624

536625
log->current_io = NULL;
537626
spin_lock_irqsave(&log->io_list_lock, flags);
538-
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
627+
if (io->has_flush || io->has_fua) {
628+
if (io != list_first_entry(&log->running_ios,
629+
struct r5l_io_unit, log_sibling)) {
630+
io->io_deferred = 1;
631+
do_submit = false;
632+
}
633+
}
539634
spin_unlock_irqrestore(&log->io_list_lock, flags);
540-
541-
submit_bio(io->current_bio);
635+
if (do_submit)
636+
r5l_do_submit_io(log, io);
542637
}
543638

544639
static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -583,6 +678,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
583678
io->log = log;
584679
INIT_LIST_HEAD(&io->log_sibling);
585680
INIT_LIST_HEAD(&io->stripe_list);
681+
bio_list_init(&io->flush_barriers);
586682
io->state = IO_UNIT_RUNNING;
587683

588684
io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -653,12 +749,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
653749
struct r5l_io_unit *io = log->current_io;
654750

655751
if (io->need_split_bio) {
656-
struct bio *prev = io->current_bio;
657-
752+
BUG_ON(io->split_bio);
753+
io->split_bio = io->current_bio;
658754
io->current_bio = r5l_bio_alloc(log);
659-
bio_chain(io->current_bio, prev);
660-
661-
submit_bio(prev);
755+
bio_chain(io->current_bio, io->split_bio);
756+
io->need_split_bio = false;
662757
}
663758

664759
if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -687,12 +782,24 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
687782

688783
io = log->current_io;
689784

785+
if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
786+
io->has_flush = 1;
787+
690788
for (i = 0; i < sh->disks; i++) {
691789
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
692790
test_bit(R5_InJournal, &sh->dev[i].flags))
693791
continue;
694792
if (i == sh->pd_idx || i == sh->qd_idx)
695793
continue;
794+
if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
795+
log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
796+
io->has_fua = 1;
797+
/*
798+
* we need to flush journal to make sure recovery can
799+
* reach the data with fua flag
800+
*/
801+
io->has_flush = 1;
802+
}
696803
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
697804
raid5_compute_blocknr(sh, i, 0),
698805
sh->dev[i].log_checksum, 0, false);
@@ -856,17 +963,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
856963
{
857964
if (!log)
858965
return -ENODEV;
859-
/*
860-
* we flush log disk cache first, then write stripe data to raid disks.
861-
* So if bio is finished, the log disk cache is flushed already. The
862-
* recovery guarantees we can recovery the bio from log disk, so we
863-
* don't need to flush again
864-
*/
865-
if (bio->bi_iter.bi_size == 0) {
866-
bio_endio(bio);
867-
return 0;
966+
967+
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
968+
/*
969+
* in write through (journal only)
970+
* we flush log disk cache first, then write stripe data to
971+
* raid disks. So if bio is finished, the log disk cache is
972+
* flushed already. The recovery guarantees we can recovery
973+
* the bio from log disk, so we don't need to flush again
974+
*/
975+
if (bio->bi_iter.bi_size == 0) {
976+
bio_endio(bio);
977+
return 0;
978+
}
979+
bio->bi_opf &= ~REQ_PREFLUSH;
980+
} else {
981+
/* write back (with cache) */
982+
if (bio->bi_iter.bi_size == 0) {
983+
mutex_lock(&log->io_mutex);
984+
r5l_get_meta(log, 0);
985+
bio_list_add(&log->current_io->flush_barriers, bio);
986+
log->current_io->has_flush = 1;
987+
log->current_io->has_null_flush = 1;
988+
atomic_inc(&log->current_io->pending_stripe);
989+
r5l_submit_current_io(log);
990+
mutex_unlock(&log->io_mutex);
991+
return 0;
992+
}
868993
}
869-
bio->bi_opf &= ~REQ_PREFLUSH;
870994
return -EAGAIN;
871995
}
872996

@@ -2470,6 +2594,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
24702594
INIT_LIST_HEAD(&log->no_space_stripes);
24712595
spin_lock_init(&log->no_space_stripes_lock);
24722596

2597+
INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
2598+
24732599
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
24742600
INIT_LIST_HEAD(&log->stripe_in_journal_list);
24752601
spin_lock_init(&log->stripe_in_journal_lock);

drivers/md/raid5.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5248,6 +5248,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
52485248
int remaining;
52495249
DEFINE_WAIT(w);
52505250
bool do_prepare;
5251+
bool do_flush = false;
52515252

52525253
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
52535254
int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5259,6 +5260,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
52595260
return;
52605261
}
52615262
/* ret == -EAGAIN, fallback */
5263+
/*
5264+
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5265+
* we need to flush journal device
5266+
*/
5267+
do_flush = bi->bi_opf & REQ_PREFLUSH;
52625268
}
52635269

52645270
md_write_start(mddev, bi);
@@ -5398,6 +5404,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
53985404
do_prepare = true;
53995405
goto retry;
54005406
}
5407+
if (do_flush) {
5408+
set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5409+
/* we only need flush for one stripe */
5410+
do_flush = false;
5411+
}
5412+
54015413
set_bit(STRIPE_HANDLE, &sh->state);
54025414
clear_bit(STRIPE_DELAYED, &sh->state);
54035415
if ((!sh->batch_head || sh == sh->batch_head) &&

drivers/md/raid5.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ enum {
376376
STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
377377
* in conf->r5c_full_stripe_list)
378378
*/
379+
STRIPE_R5C_PREFLUSH, /* need to flush journal device */
379380
};
380381

381382
#define STRIPE_EXPAND_SYNC_FLAGS \

0 commit comments

Comments
 (0)