@@ -159,6 +159,9 @@ struct r5l_log {
 
 	spinlock_t stripe_in_journal_lock;
 	atomic_t stripe_in_journal_count;
+
+	/* to submit async io_units, to fulfill ordering of flush */
+	struct work_struct deferred_io_work;
 };
 
 /*
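Note: deferred_io_work follows the usual embedded-work-item idiom; the handler added further down (r5l_submit_io_async) recovers the owning r5l_log with container_of(). A minimal sketch of that idiom, with hypothetical names, not code from this patch:

	#include <linux/workqueue.h>

	struct my_log {
		struct work_struct deferred_work;	/* embedded work item */
	};

	static void my_handler(struct work_struct *work)
	{
		/* map the work pointer back to its containing structure */
		struct my_log *log = container_of(work, struct my_log,
						  deferred_work);

		pr_debug("dispatching deferred io for %p\n", log);
	}

	/* init once:  INIT_WORK(&log->deferred_work, my_handler);
	 * then, e.g. from completion context:
	 *             schedule_work(&log->deferred_work);
	 */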
@@ -185,6 +188,18 @@ struct r5l_io_unit {
 
 	int state;
 	bool need_split_bio;
+	struct bio *split_bio;
+
+	unsigned int has_flush:1;		/* include flush request */
+	unsigned int has_fua:1;			/* include fua request */
+	unsigned int has_null_flush:1;		/* include empty flush request */
+	/*
+	 * io isn't sent yet; a flush/fua io_unit can only be submitted once
+	 * it is the first IO in the running_ios list
+	 */
+	unsigned int io_deferred:1;
+
+	struct bio_list flush_barriers;		/* size == 0 flush bios */
 };
 
 /* r5l_io_unit state */
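The flush_barriers list collects the zero-size flush bios that ride along with an io_unit; r5l_new_meta() initializes it and r5l_log_endio() drains it once the log write is stable. For reference, bio_list is a plain singly linked FIFO; an illustrative sketch of the calls this patch relies on (hypothetical function, not from the patch):

	#include <linux/bio.h>

	static void barrier_list_example(struct bio *flush_bio)
	{
		struct bio_list barriers;

		bio_list_init(&barriers);		/* empty FIFO */
		bio_list_add(&barriers, flush_bio);	/* queue at the tail */
		while (!bio_list_empty(&barriers))
			bio_endio(bio_list_pop(&barriers));	/* drain in order */
	}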
@@ -494,9 +509,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 	}
 }
 
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_io_unit *io_deferred;
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
@@ -512,18 +529,89 @@ static void r5l_log_endio(struct bio *bio)
 		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
+	if (!list_empty(&log->running_ios)) {
+		/*
+		 * FLUSH/FUA io_unit is deferred because of ordering, now we
+		 * can dispatch it
+		 */
+		io_deferred = list_first_entry(&log->running_ios,
+					       struct r5l_io_unit, log_sibling);
+		if (io_deferred->io_deferred)
+			schedule_work(&log->deferred_io_work);
+	}
+
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
 	if (log->need_cache_flush)
 		md_wakeup_thread(log->rdev->mddev->thread);
+
+	if (io->has_null_flush) {
+		struct bio *bi;
+
+		WARN_ON(bio_list_empty(&io->flush_barriers));
+		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+			bio_endio(bi);
+			atomic_dec(&io->pending_stripe);
+		}
+		if (atomic_read(&io->pending_stripe) == 0)
+			__r5l_stripe_write_finished(io);
+	}
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (io->has_flush)
+		io->current_bio->bi_opf |= REQ_PREFLUSH;
+	if (io->has_fua)
+		io->current_bio->bi_opf |= REQ_FUA;
+	submit_bio(io->current_bio);
+
+	if (!io->split_bio)
+		return;
+
+	if (io->has_flush)
+		io->split_bio->bi_opf |= REQ_PREFLUSH;
+	if (io->has_fua)
+		io->split_bio->bi_opf |= REQ_FUA;
+	submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+	struct r5l_log *log = container_of(work, struct r5l_log,
+					   deferred_io_work);
+	struct r5l_io_unit *io = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	if (!list_empty(&log->running_ios)) {
+		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+				      log_sibling);
+		if (!io->io_deferred)
+			io = NULL;
+		else
+			io->io_deferred = 0;
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+	if (io)
+		r5l_do_submit_io(log, io);
 }
 
 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
+	struct bio *bio;
 	struct r5l_meta_block *block;
 	unsigned long flags;
 	u32 crc;
+	bool do_submit = true;
 
 	if (!io)
 		return;
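One subtlety in r5l_do_submit_io() above: REQ_PREFLUSH asks the device to flush its volatile cache before servicing the write, while REQ_FUA forces the write itself to stable media, and an io_unit may carry both, so the two flags must be accumulated on bi_opf rather than one overwriting the other. A standalone sketch of that accumulation (hypothetical helper, assuming the bio's op is already REQ_OP_WRITE as set in r5l_bio_alloc()):

	#include <linux/blk_types.h>

	static void apply_barrier_flags(struct bio *bio, bool flush, bool fua)
	{
		if (flush)
			bio->bi_opf |= REQ_PREFLUSH;	/* flush cache before this write */
		if (fua)
			bio->bi_opf |= REQ_FUA;		/* this write reaches media */
	}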
@@ -532,13 +620,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	block->meta_size = cpu_to_le32(io->meta_offset);
 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 	block->checksum = cpu_to_le32(crc);
+	bio = io->current_bio;
 
 	log->current_io = NULL;
 	spin_lock_irqsave(&log->io_list_lock, flags);
-	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	if (io->has_flush || io->has_fua) {
+		if (io != list_first_entry(&log->running_ios,
+					   struct r5l_io_unit, log_sibling)) {
+			io->io_deferred = 1;
+			do_submit = false;
+		}
+	}
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-
-	submit_bio(io->current_bio);
+	if (do_submit)
+		r5l_do_submit_io(log, io);
 }
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -583,6 +678,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;
 
 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -653,12 +749,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	struct r5l_io_unit *io = log->current_io;
 
 	if (io->need_split_bio) {
-		struct bio *prev = io->current_bio;
-
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
 		io->current_bio = r5l_bio_alloc(log);
-		bio_chain(io->current_bio, prev);
-
-		submit_bio(prev);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
 	}
 
 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
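The rework above stops submitting the parent bio at split time; it is parked in io->split_bio so that r5l_do_submit_io() can stamp flush/FUA flags on both halves and submit them together, while bio_chain() still guarantees the parent's bi_end_io only runs after the chained child completes. A minimal sketch of the chaining idiom (hypothetical helper, not from the patch):

	#include <linux/bio.h>

	static void submit_chained_pair(struct bio *parent, struct bio *child)
	{
		bio_chain(child, parent);	/* parent's endio waits on child */
		submit_bio(parent);
		submit_bio(child);
	}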
@@ -687,12 +782,24 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 
 	io = log->current_io;
 
+	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+		io->has_flush = 1;
+
 	for (i = 0; i < sh->disks; i++) {
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
 		    test_bit(R5_InJournal, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
+		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
+		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+			io->has_fua = 1;
+			/*
+			 * we need to flush journal to make sure recovery can
+			 * reach the data with fua flag
+			 */
+			io->has_flush = 1;
+		}
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 					raid5_compute_blocknr(sh, i, 0),
 					sh->dev[i].log_checksum, 0, false);
@@ -856,17 +963,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
 	if (!log)
 		return -ENODEV;
-	/*
-	 * we flush log disk cache first, then write stripe data to raid disks.
-	 * So if bio is finished, the log disk cache is flushed already. The
-	 * recovery guarantees we can recovery the bio from log disk, so we
-	 * don't need to flush again
-	 */
-	if (bio->bi_iter.bi_size == 0) {
-		bio_endio(bio);
-		return 0;
+
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+		/*
+		 * in write through (journal only)
+		 * we flush log disk cache first, then write stripe data to
+		 * raid disks. So if bio is finished, the log disk cache is
+		 * flushed already. The recovery guarantees we can recover
+		 * the bio from log disk, so we don't need to flush again
+		 */
+		if (bio->bi_iter.bi_size == 0) {
+			bio_endio(bio);
+			return 0;
+		}
+		bio->bi_opf &= ~REQ_PREFLUSH;
+	} else {
+		/* write back (with cache) */
+		if (bio->bi_iter.bi_size == 0) {
+			mutex_lock(&log->io_mutex);
+			r5l_get_meta(log, 0);
+			bio_list_add(&log->current_io->flush_barriers, bio);
+			log->current_io->has_flush = 1;
+			log->current_io->has_null_flush = 1;
+			atomic_inc(&log->current_io->pending_stripe);
+			r5l_submit_current_io(log);
+			mutex_unlock(&log->io_mutex);
+			return 0;
+		}
 	}
-	bio->bi_opf &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
 
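The return contract of r5l_handle_flush_request() is unchanged by this patch: -ENODEV means no journal is configured, 0 means the log now owns the bio and will complete it, and -EAGAIN means the flush semantics were absorbed and the caller should push the data portion down the normal path. A hedged sketch of a caller (hypothetical wrapper; the real consumer is raid5's make_request path):

	static bool try_log_flush(struct r5conf *conf, struct bio *bi)
	{
		switch (r5l_handle_flush_request(conf->log, bi)) {
		case 0:
			return true;	/* log owns @bi and will end it */
		case -EAGAIN:
			return false;	/* flush handled; submit data normally */
		default:
			return false;	/* -ENODEV: no journal, use md's flush path */
		}
	}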
@@ -2470,6 +2594,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	INIT_LIST_HEAD(&log->stripe_in_journal_list);
 	spin_lock_init(&log->stripe_in_journal_lock);