@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
810810static int __bch2_buffered_write (struct bch_inode_info * inode ,
811811 struct address_space * mapping ,
812812 struct iov_iter * iter ,
813- loff_t pos , unsigned len )
813+ loff_t pos , unsigned len ,
814+ bool inode_locked )
814815{
815816 struct bch_fs * c = inode -> v .i_sb -> s_fs_info ;
816817 struct bch2_folio_reservation res ;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
835836
836837 BUG_ON (!fs .nr );
837838
839+ /*
840+ * If we're not using the inode lock, we need to lock all the folios for
 841+ * atomicity of writes vs. other writes:
842+ */
843+ if (!inode_locked && folio_end_pos (darray_last (fs )) < end ) {
844+ ret = - BCH_ERR_need_inode_lock ;
845+ goto out ;
846+ }
847+
838848 f = darray_first (fs );
839849 if (pos != folio_pos (f ) && !folio_test_uptodate (f )) {
840850 ret = bch2_read_single_folio (f , mapping );
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
929939 end = pos + copied ;
930940
931941 spin_lock (& inode -> v .i_lock );
932- if (end > inode -> v .i_size )
942+ if (end > inode -> v .i_size ) {
943+ BUG_ON (!inode_locked );
933944 i_size_write (& inode -> v , end );
945+ }
934946 spin_unlock (& inode -> v .i_lock );
935947
936948 f_pos = pos ;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
974986 struct file * file = iocb -> ki_filp ;
975987 struct address_space * mapping = file -> f_mapping ;
976988 struct bch_inode_info * inode = file_bch_inode (file );
977- loff_t pos = iocb -> ki_pos ;
978- ssize_t written = 0 ;
979- int ret = 0 ;
989+ loff_t pos ;
990+ bool inode_locked = false;
991+ ssize_t written = 0 , written2 = 0 , ret = 0 ;
992+
993+ /*
994+ * We don't take the inode lock unless i_size will be changing. Folio
995+ * locks provide exclusion with other writes, and the pagecache add lock
996+ * provides exclusion with truncate and hole punching.
997+ *
998+ * There is one nasty corner case where atomicity would be broken
999+ * without great care: when copying data from userspace to the page
 1000+ * cache, we do that with faults disabled - a page fault would recurse
1001+ * back into the filesystem, taking filesystem locks again, and
1002+ * deadlock; so it's done with faults disabled, and we fault in the user
1003+ * buffer when we aren't holding locks.
1004+ *
 1005+ * If we do part of the write, but we then race and the pages of the userspace
 1006+ * buffer have been evicted and are no longer resident, then we have to
1007+ * drop our folio locks to re-fault them in, breaking write atomicity.
1008+ *
1009+ * To fix this, we restart the write from the start, if we weren't
1010+ * holding the inode lock.
1011+ *
1012+ * There is another wrinkle after that; if we restart the write from the
1013+ * start, and then get an unrecoverable error, we _cannot_ claim to
1014+ * userspace that we did not write data we actually did - so we must
1015+ * track (written2) the most we ever wrote.
1016+ */
1017+
1018+ if ((iocb -> ki_flags & IOCB_APPEND ) ||
1019+ (iocb -> ki_pos + iov_iter_count (iter ) > i_size_read (& inode -> v ))) {
1020+ inode_lock (& inode -> v );
1021+ inode_locked = true;
1022+ }
1023+
1024+ ret = generic_write_checks (iocb , iter );
1025+ if (ret <= 0 )
1026+ goto unlock ;
1027+
1028+ ret = file_remove_privs_flags (file , !inode_locked ? IOCB_NOWAIT : 0 );
1029+ if (ret ) {
1030+ if (!inode_locked ) {
1031+ inode_lock (& inode -> v );
1032+ inode_locked = true;
1033+ ret = file_remove_privs_flags (file , 0 );
1034+ }
1035+ if (ret )
1036+ goto unlock ;
1037+ }
1038+
1039+ ret = file_update_time (file );
1040+ if (ret )
1041+ goto unlock ;
1042+
1043+ pos = iocb -> ki_pos ;
9801044
9811045 bch2_pagecache_add_get (inode );
9821046
1047+ if (!inode_locked &&
1048+ (iocb -> ki_pos + iov_iter_count (iter ) > i_size_read (& inode -> v )))
1049+ goto get_inode_lock ;
1050+
9831051 do {
9841052 unsigned offset = pos & (PAGE_SIZE - 1 );
9851053 unsigned bytes = iov_iter_count (iter );
@@ -1004,12 +1072,17 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10041072 }
10051073 }
10061074
1075+ if (unlikely (bytes != iov_iter_count (iter ) && !inode_locked ))
1076+ goto get_inode_lock ;
1077+
10071078 if (unlikely (fatal_signal_pending (current ))) {
10081079 ret = - EINTR ;
10091080 break ;
10101081 }
10111082
1012- ret = __bch2_buffered_write (inode , mapping , iter , pos , bytes );
1083+ ret = __bch2_buffered_write (inode , mapping , iter , pos , bytes , inode_locked );
1084+ if (ret == - BCH_ERR_need_inode_lock )
1085+ goto get_inode_lock ;
10131086 if (unlikely (ret < 0 ))
10141087 break ;
10151088
@@ -1030,50 +1103,46 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10301103 }
10311104 pos += ret ;
10321105 written += ret ;
1106+ written2 = max (written , written2 );
1107+
1108+ if (ret != bytes && !inode_locked )
1109+ goto get_inode_lock ;
10331110 ret = 0 ;
10341111
10351112 balance_dirty_pages_ratelimited (mapping );
1036- } while (iov_iter_count (iter ));
10371113
1114+ if (0 ) {
1115+ get_inode_lock :
1116+ bch2_pagecache_add_put (inode );
1117+ inode_lock (& inode -> v );
1118+ inode_locked = true;
1119+ bch2_pagecache_add_get (inode );
1120+
1121+ iov_iter_revert (iter , written );
1122+ pos -= written ;
1123+ written = 0 ;
1124+ ret = 0 ;
1125+ }
1126+ } while (iov_iter_count (iter ));
10381127 bch2_pagecache_add_put (inode );
1128+ unlock :
1129+ if (inode_locked )
1130+ inode_unlock (& inode -> v );
1131+
1132+ iocb -> ki_pos += written ;
10391133
1040- return written ? written : ret ;
1134+ ret = max (written , written2 ) ?: ret ;
1135+ if (ret > 0 )
1136+ ret = generic_write_sync (iocb , ret );
1137+ return ret ;
10411138}
10421139
1043- ssize_t bch2_write_iter (struct kiocb * iocb , struct iov_iter * from )
1140+ ssize_t bch2_write_iter (struct kiocb * iocb , struct iov_iter * iter )
10441141{
1045- struct file * file = iocb -> ki_filp ;
1046- struct bch_inode_info * inode = file_bch_inode (file );
1047- ssize_t ret ;
1048-
1049- if (iocb -> ki_flags & IOCB_DIRECT ) {
1050- ret = bch2_direct_write (iocb , from );
1051- goto out ;
1052- }
1053-
1054- inode_lock (& inode -> v );
1055-
1056- ret = generic_write_checks (iocb , from );
1057- if (ret <= 0 )
1058- goto unlock ;
1059-
1060- ret = file_remove_privs (file );
1061- if (ret )
1062- goto unlock ;
1063-
1064- ret = file_update_time (file );
1065- if (ret )
1066- goto unlock ;
1067-
1068- ret = bch2_buffered_write (iocb , from );
1069- if (likely (ret > 0 ))
1070- iocb -> ki_pos += ret ;
1071- unlock :
1072- inode_unlock (& inode -> v );
1142+ ssize_t ret = iocb -> ki_flags & IOCB_DIRECT
1143+ ? bch2_direct_write (iocb , iter )
1144+ : bch2_buffered_write (iocb , iter );
10731145
1074- if (ret > 0 )
1075- ret = generic_write_sync (iocb , ret );
1076- out :
10771146 return bch2_err_class (ret );
10781147}
10791148
0 commit comments