
Commit 48901e9

Merge branch 'mm-hotfixes-stable' into mm-stable.
Pick these into mm-stable:

  5de1950 mm: resolve faulty mmap_region() error path behaviour
  5baf8b0 mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
  0fb4a7a mm: refactor map_deny_write_exec()
  4080ef1 mm: unconditionally close VMAs on error
  3dd6ed3 mm: avoid unsafe VMA hook invocation when error arises on mmap hook
  f8f931b mm/thp: fix deferred split unqueue naming and locking
  e66f318 mm/thp: fix deferred split queue not partially_mapped

to get a clean merge of these from mm-unstable into mm-stable:

  Subject: memcg-v1: fully deprecate move_charge_at_immigrate
  Subject: memcg-v1: remove charge move code
  Subject: memcg-v1: no need for memcg locking for dirty tracking
  Subject: memcg-v1: no need for memcg locking for writeback tracking
  Subject: memcg-v1: no need for memcg locking for MGLRU
  Subject: memcg-v1: remove memcg move locking code
  Subject: tools: testing: add additional vma_internal.h stubs
  Subject: mm: isolate mmap internal logic to mm/vma.c
  Subject: mm: refactor __mmap_region()
  Subject: mm: remove unnecessary reset state logic on merge new VMA
  Subject: mm: defer second attempt at merge on mmap()
  Subject: mm/vma: the pgoff is correct if can_merge_right
  Subject: memcg: workingset: remove folio_memcg_rcu usage
2 parents: 59b723c + 5de1950

File tree

17 files changed: +245, -125 lines

arch/arm64/include/asm/mman.h

Lines changed: 7 additions & 3 deletions
@@ -6,6 +6,8 @@
 
 #ifndef BUILD_VDSO
 #include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
 #include <linux/types.h>
 
 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
 }
 #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
+                                                   unsigned long flags)
 {
         /*
          * Only allow MTE on anonymous mappings as these are guaranteed to be
          * backed by tags-capable memory. The vm_flags may be overridden by a
          * filesystem supporting MTE (RAM-based).
          */
-        if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+        if (system_supports_mte() &&
+            ((flags & MAP_ANONYMOUS) || shmem_file(file)))
                 return VM_MTE_ALLOWED;
 
         return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 static inline bool arch_validate_prot(unsigned long prot,
                                       unsigned long addr __always_unused)
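For context on where the new file argument comes from: the generic mmap path translates the userspace prot/flags into vm_flags and now threads the backing file through to this arch hook, so arm64 can permit MTE (VM_MTE_ALLOWED) on shmem-backed mappings as well as anonymous ones. A minimal sketch of the kind of call site this implies, assuming the usual do_mmap() flag computation; the mm/mmap.c side of this series is not shown in this excerpt, so treat the snippet as illustrative rather than a quotation of the commit:

        /* Illustrative only: compute vm_flags for a new mapping, passing the
         * backing file so arch_calc_vm_flag_bits() can recognise shmem files.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                    mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;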

arch/parisc/include/asm/mman.h

Lines changed: 3 additions & 2 deletions
@@ -2,6 +2,7 @@
 #ifndef __ASM_MMAN_H__
 #define __ASM_MMAN_H__
 
+#include <linux/fs.h>
 #include <uapi/asm/mman.h>
 
 /* PARISC cannot allow mdwe as it needs writable stacks */
@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
 }
 #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
         /*
          * The stack on parisc grows upwards, so if userspace requests memory
@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
 
         return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 #endif /* __ASM_MMAN_H__ */

include/linux/mman.h

Lines changed: 22 additions & 6 deletions
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMAN_H
 #define _LINUX_MMAN_H
 
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/percpu_counter.h>
 
@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
 #endif
 
 #ifndef arch_calc_vm_flag_bits
-#define arch_calc_vm_flag_bits(flags) 0
+#define arch_calc_vm_flag_bits(file, flags) 0
 #endif
 
 #ifndef arch_validate_prot
@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
  * Combine the mmap "flags" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_flag_bits(unsigned long flags)
+calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
         return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
                _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
                _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
                _calc_vm_trans(flags, MAP_STACK,      VM_NOHUGEPAGE) |
-               arch_calc_vm_flag_bits(flags);
+               arch_calc_vm_flag_bits(file, flags);
 }
 
 unsigned long vm_commit_limit(void);
@@ -188,16 +189,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
  *
  *  d) mmap(PROT_READ | PROT_EXEC)
  *     mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
  */
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
 {
+        /* If MDWE is disabled, we have nothing to deny. */
         if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
                 return false;
 
-        if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+        /* If the new VMA is not executable, we have nothing to deny. */
+        if (!(new & VM_EXEC))
+                return false;
+
+        /* Under MDWE we do not accept newly writably executable VMAs... */
+        if (new & VM_WRITE)
                 return true;
 
-        if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+        /* ...nor previously non-executable VMAs becoming executable. */
+        if (!(old & VM_EXEC))
                 return true;
 
         return false;
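Because map_deny_write_exec() now takes the old and proposed flag values rather than a VMA pointer, callers can use it both when changing an existing VMA and before any VMA exists at mmap() time. A minimal sketch of an mprotect()-style caller under the new interface; the surrounding names (newflags, error, out) are illustrative, not quoted from this commit:

        /* Illustrative only: newflags is the protection the caller proposes.
         * Under MDWE, refuse the change rather than silently downgrading it.
         */
        if (map_deny_write_exec(vma->vm_flags, newflags)) {
                error = -EACCES;
                goto out;
        }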

mm/huge_memory.c

Lines changed: 43 additions & 13 deletions
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
         return split_huge_page_to_list_to_order(&folio->page, list, ret);
 }
 
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
 {
         struct deferred_split *ds_queue;
         unsigned long flags;
+        bool unqueued = false;
+
+        WARN_ON_ONCE(folio_ref_count(folio));
+        WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
 
         ds_queue = get_deferred_split_queue(folio);
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
                                 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                 }
                 list_del_init(&folio->_deferred_list);
+                unqueued = true;
         }
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+        return unqueued;        /* useful for debug warnings */
 }
 
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                 return;
 
         /*
-         * The try_to_unmap() in page reclaim path might reach here too,
-         * this may cause a race condition to corrupt deferred split queue.
-         * And, if page reclaim is already handling the same folio, it is
-         * unnecessary to handle it again in shrinker.
-         *
-         * Check the swapcache flag to determine if the folio is being
-         * handled by page reclaim since THP swap would add the folio into
-         * swap cache before calling try_to_unmap().
+         * Exclude swapcache: originally to avoid a corrupt deferred split
+         * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+         * but if page reclaim is already handling the same folio, it is
+         * unnecessary to handle it again in the shrinker, so excluding
+         * swapcache here may still be a useful optimization.
          */
         if (folio_test_swapcache(folio))
                 return;
@@ -3718,8 +3735,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
         unsigned long flags;
         LIST_HEAD(list);
-        struct folio *folio, *next;
-        int split = 0;
+        struct folio *folio, *next, *prev = NULL;
+        int split = 0, removed = 0;
 
 #ifdef CONFIG_MEMCG
         if (sc->memcg)
@@ -3775,15 +3792,28 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                  */
                 if (!did_split && !folio_test_partially_mapped(folio)) {
                         list_del_init(&folio->_deferred_list);
-                        ds_queue->split_queue_len--;
+                        removed++;
+                } else {
+                        /*
+                         * That unlocked list_del_init() above would be unsafe,
+                         * unless its folio is separated from any earlier folios
+                         * left on the list (which may be concurrently unqueued)
+                         * by one safe folio with refcount still raised.
+                         */
+                        swap(folio, prev);
                 }
-                folio_put(folio);
+                if (folio)
+                        folio_put(folio);
         }
 
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         list_splice_tail(&list, &ds_queue->split_queue);
+        ds_queue->split_queue_len -= removed;
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
+        if (prev)
+                folio_put(prev);
+
         /*
          * Stop shrinker if we didn't split any page, but the queue is empty.
          * This can happen if pages were freed under us.

mm/internal.h

Lines changed: 50 additions & 5 deletions
@@ -108,6 +108,51 @@ static inline void *folio_raw_mapping(const struct folio *folio)
         return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
 }
 
+/*
+ * This is a file-backed mapping, and is about to be memory mapped - invoke its
+ * mmap hook and safely handle error conditions. On error, VMA hooks will be
+ * mutated.
+ *
+ * @file: File which backs the mapping.
+ * @vma:  VMA which we are mapping.
+ *
+ * Returns: 0 if success, error otherwise.
+ */
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+        int err = call_mmap(file, vma);
+
+        if (likely(!err))
+                return 0;
+
+        /*
+         * OK, we tried to call the file hook for mmap(), but an error
+         * arose. The mapping is in an inconsistent state and we most not invoke
+         * any further hooks on it.
+         */
+        vma->vm_ops = &vma_dummy_vm_ops;
+
+        return err;
+}
+
+/*
+ * If the VMA has a close hook then close it, and since closing it might leave
+ * it in an inconsistent state which makes the use of any hooks suspect, clear
+ * them down by installing dummy empty hooks.
+ */
+static inline void vma_close(struct vm_area_struct *vma)
+{
+        if (vma->vm_ops && vma->vm_ops->close) {
+                vma->vm_ops->close(vma);
+
+                /*
+                 * The mapping is in an inconsistent state, and no further hooks
+                 * may be invoked upon it.
+                 */
+                vma->vm_ops = &vma_dummy_vm_ops;
+        }
+}
+
 #ifdef CONFIG_MMU
 
 /* Flags for folio_pte_batch(). */
@@ -639,21 +684,21 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
 #endif
 }
 
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
 {
         if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
-                return;
+                return false;
 
         /*
          * At this point, there is no one trying to add the folio to
          * deferred_list. If folio is not in deferred_list, it's safe
          * to check without acquiring the split_queue_lock.
          */
         if (data_race(list_empty(&folio->_deferred_list)))
-                return;
+                return false;
 
-        __folio_undo_large_rmappable(folio);
+        return __folio_unqueue_deferred_split(folio);
 }
 
 static inline struct folio *page_rmappable_folio(struct page *page)
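For orientation, a condensed sketch of how a mapping path is expected to use these two helpers together; the real caller is the mmap_region()/__mmap_region() path, whose diff is not shown in this excerpt, and the labels and control flow below are illustrative only:

        if (file) {
                /* Call f_op->mmap(). On failure, mmap_file() has already
                 * installed vma_dummy_vm_ops, so no further hooks - notably
                 * ->close() - can run on the half-initialised VMA.
                 */
                error = mmap_file(file, vma);
                if (error)
                        goto free_vma;          /* illustrative label */
        }
        /* ... */
free_vma:
        /* Error teardown funnels through vma_close(), which invokes ->close()
         * at most once and then installs dummy ops so it cannot run again.
         */
        vma_close(vma);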

mm/memcontrol-v1.c

Lines changed: 25 additions & 0 deletions
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
         css_get(&to->css);
         css_put(&from->css);
 
+        /* Warning should never happen, so don't worry about refcount non-0 */
+        WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
         folio->memcg_data = (unsigned long)to;
 
         __folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         enum mc_target_type target_type;
         union mc_target target;
         struct folio *folio;
+        bool tried_split_before = false;
 
+retry_pmd:
         ptl = pmd_trans_huge_lock(pmd, vma);
         if (ptl) {
                 if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
                 if (target_type == MC_TARGET_PAGE) {
                         folio = target.folio;
+                        /*
+                         * Deferred split queue locking depends on memcg,
+                         * and unqueue is unsafe unless folio refcount is 0:
+                         * split or skip if on the queue? first try to split.
+                         */
+                        if (!list_empty(&folio->_deferred_list)) {
+                                spin_unlock(ptl);
+                                if (!tried_split_before)
+                                        split_folio(folio);
+                                folio_unlock(folio);
+                                folio_put(folio);
+                                if (tried_split_before)
+                                        return 0;
+                                tried_split_before = true;
+                                goto retry_pmd;
+                        }
+                        /*
+                         * So long as that pmd lock is held, the folio cannot
+                         * be racily added to the _deferred_list, because
+                         * __folio_remove_rmap() will find !partially_mapped.
+                         */
                         if (folio_isolate_lru(folio)) {
                                 if (!mem_cgroup_move_account(folio, true,
                                                              mc.from, mc.to)) {

mm/memcontrol.c

Lines changed: 5 additions & 4 deletions
@@ -4629,10 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
         struct obj_cgroup *objcg;
 
         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-        VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
-                        !folio_test_hugetlb(folio) &&
-                        !list_empty(&folio->_deferred_list) &&
-                        folio_test_partially_mapped(folio), folio);
 
         /*
          * Nobody should be changing or seriously looking at
@@ -4679,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
                 ug->nr_memory += nr_pages;
                 ug->pgpgout++;
 
+                WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
                 folio->memcg_data = 0;
         }
 
@@ -4790,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 
         /* Transfer the charge and the css ref */
         commit_charge(new, memcg);
+
+        /* Warning should never happen, so don't worry about refcount non-0 */
+        WARN_ON_ONCE(folio_unqueue_deferred_split(old));
         old->memcg_data = 0;
 }
 
@@ -4976,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
         VM_BUG_ON_FOLIO(oldid, folio);
         mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
+        folio_unqueue_deferred_split(folio);
         folio->memcg_data = 0;
 
         if (!mem_cgroup_is_root(memcg))
