Skip to content

Commit 33f96fe

Browse files
committed
Merge: mm: Remember a/d bits for migration entries
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2294 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168392 Testing: KT1-lite + MM regressions as reported on the BZ ticket This work brings in upstream homonymous series [1] alongside its missing dependencies and required follow-ups to address the feature request expressed in Bug 2168392. This is a micro-optimization for the page migration path, which would unconditionally mark migrated pages as old & clean. This changeset teaches the migration code to remember these PTE bits from the old PTE and properly reset them after migrating the page over. For the generic use case, losing these bits in the new PTE isn't really much of a problem, but there are measurable performance costs. For specific use cases (DEVICE MEMORY migrations come to mind) this might represent potential problems, though, so we're bringing in the changeset for correctness and performance. [1] https://lore.kernel.org/all/20220811161331.37055-6-peterx@redhat.com/T/#me0e2db4c09f9a615cc7f7e21df37ce184f48571d Omitted-fix: bf2f34a ("LoongArch: Set _PAGE_DIRTY only if _PAGE_WRITE is set in {pmd,pte}_mkdirty()") Rafael Aquini (13): include/linux/swapops.h: remove stub for non_swap_entry() mm/swapops: make is_pmd_migration_entry more strict mm/x86: use SWP_TYPE_BITS in 3-level swap macros mm/swap: comment all the ifdef in swapops.h mm/swap: add swp_offset_pfn() to fetch PFN from swap entry mm/thp: carry over dirty bit when thp splits on pmd mm: remember young/dirty bit for page migrations mm/swap: cache maximum swapfile size when init swap mm/swap: cache swap migration A/D bits support mm/swap: fix SWP_PFN_BITS with CONFIG_PHYS_ADDR_T_64BIT on 32bit Partly revert "mm/thp: carry over dirty bit when thp splits on pmd" mm/thp: re-apply mkdirty for small pages after split mm/migrate: fix wrongly apply write bit after mkdirty on sparc64 Signed-off-by: Rafael Aquini <aquini@redhat.com> Approved-by: Nico Pache 
<npache@redhat.com> Approved-by: Waiman Long <longman@redhat.com> Approved-by: Donald Dutile <ddutile@redhat.com> Approved-by: Prarit Bhargava <prarit@redhat.com> Signed-off-by: Jan Stancek <jstancek@redhat.com>
2 parents 80348d8 + 83e5bd4 commit 33f96fe

File tree

14 files changed

+232
-50
lines changed

14 files changed

+232
-50
lines changed

arch/arm64/mm/hugetlbpage.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
244244
{
245245
VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
246246

247-
return page_folio(pfn_to_page(swp_offset(entry)));
247+
return page_folio(pfn_to_page(swp_offset_pfn(entry)));
248248
}
249249

250250
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,

arch/x86/include/asm/pgtable-3level.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
256256
/* We always extract/encode the offset by shifting it all the way up, and then down again */
257257
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
258258

259-
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
260-
#define __swp_type(x) (((x).val) & 0x1f)
261-
#define __swp_offset(x) ((x).val >> 5)
262-
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
259+
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
260+
#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1))
261+
#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS)
262+
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS})
263263

264264
/*
265265
* Normally, __swp_entry() converts from arch-independent swp_entry_t to

arch/x86/mm/init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1054,7 +1054,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
10541054
}
10551055

10561056
#ifdef CONFIG_SWAP
1057-
unsigned long max_swapfile_size(void)
1057+
unsigned long arch_max_swapfile_size(void)
10581058
{
10591059
unsigned long pages;
10601060

fs/proc/task_mmu.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,9 +1410,19 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
14101410
if (pte_swp_uffd_wp(pte))
14111411
flags |= PM_UFFD_WP;
14121412
entry = pte_to_swp_entry(pte);
1413-
if (pm->show_pfn)
1413+
if (pm->show_pfn) {
1414+
pgoff_t offset;
1415+
/*
1416+
* For PFN swap offsets, keeping the offset field
1417+
* to be PFN only to be compatible with old smaps.
1418+
*/
1419+
if (is_pfn_swap_entry(entry))
1420+
offset = swp_offset_pfn(entry);
1421+
else
1422+
offset = swp_offset(entry);
14141423
frame = swp_type(entry) |
1415-
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1424+
(offset << MAX_SWAPFILES_SHIFT);
1425+
}
14161426
flags |= PM_SWAP;
14171427
migration = is_migration_entry(entry);
14181428
if (is_pfn_swap_entry(entry))
@@ -1467,7 +1477,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
14671477
unsigned long offset;
14681478

14691479
if (pm->show_pfn) {
1470-
offset = swp_offset(entry) +
1480+
if (is_pfn_swap_entry(entry))
1481+
offset = swp_offset_pfn(entry);
1482+
else
1483+
offset = swp_offset(entry);
1484+
offset = offset +
14711485
((addr & ~PMD_MASK) >> PAGE_SHIFT);
14721486
frame = swp_type(entry) |
14731487
(offset << MAX_SWAPFILES_SHIFT);

include/linux/swapfile.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
*/
99
extern struct swap_info_struct *swap_info[];
1010
extern unsigned long generic_max_swapfile_size(void);
11-
extern unsigned long max_swapfile_size(void);
11+
unsigned long arch_max_swapfile_size(void);
12+
13+
/* Maximum swapfile size supported for the arch (not inclusive). */
14+
extern unsigned long swapfile_maximum_size;
15+
/* Whether swap migration entry supports storing A/D bits for the arch */
16+
extern bool swap_migration_ad_supported;
1217

1318
#endif /* _LINUX_SWAPFILE_H */

include/linux/swapops.h

Lines changed: 134 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88

99
#ifdef CONFIG_MMU
1010

11+
#ifdef CONFIG_SWAP
12+
#include <linux/swapfile.h>
13+
#endif /* CONFIG_SWAP */
14+
1115
/*
1216
* swapcache pages are stored in the swapper_space radix tree. We want to
1317
* get good packing density in that tree, so the index should be dense in
@@ -23,6 +27,47 @@
2327
#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
2428
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
2529

30+
/*
31+
* Definitions only for PFN swap entries (see is_pfn_swap_entry()). To
32+
* store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries
33+
* can use the extra bits to store other information besides PFN.
34+
*/
35+
#ifdef MAX_PHYSMEM_BITS
36+
#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
37+
#else /* MAX_PHYSMEM_BITS */
38+
#define SWP_PFN_BITS min_t(int, \
39+
sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \
40+
SWP_TYPE_SHIFT)
41+
#endif /* MAX_PHYSMEM_BITS */
42+
#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1)
43+
44+
/**
45+
* Migration swap entry specific bitfield definitions. Layout:
46+
*
47+
* |----------+--------------------|
48+
* | swp_type | swp_offset |
49+
* |----------+--------+-+-+-------|
50+
* | | resv |D|A| PFN |
51+
* |----------+--------+-+-+-------|
52+
*
53+
* @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
54+
* @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
55+
*
56+
* Note: A/D bits will be stored in migration entries iff there're enough
57+
* free bits in arch specific swp offset. By default we'll ignore A/D bits
58+
* when migrating a page. Please refer to migration_entry_supports_ad()
59+
* for more information. If there're more bits besides PFN and A/D bits,
60+
* they should be reserved and always be zeros.
61+
*/
62+
#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS)
63+
#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1)
64+
#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2)
65+
66+
#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT)
67+
#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT)
68+
69+
static inline bool is_pfn_swap_entry(swp_entry_t entry);
70+
2671
/* Clear all flags but only keep swp_entry_t related information */
2772
static inline pte_t pte_swp_clear_flags(pte_t pte)
2873
{
@@ -64,6 +109,17 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
64109
return entry.val & SWP_OFFSET_MASK;
65110
}
66111

112+
/*
113+
* This should only be called upon a pfn swap entry to get the PFN stored
114+
* in the swap entry. Please refers to is_pfn_swap_entry() for definition
115+
* of pfn swap entry.
116+
*/
117+
static inline unsigned long swp_offset_pfn(swp_entry_t entry)
118+
{
119+
VM_BUG_ON(!is_pfn_swap_entry(entry));
120+
return swp_offset(entry) & SWP_PFN_MASK;
121+
}
122+
67123
/* check whether a pte points to a swap entry */
68124
static inline int is_swap_pte(pte_t pte)
69125
{
@@ -240,15 +296,61 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
240296
return swp_entry(SWP_MIGRATION_WRITE, offset);
241297
}
242298

299+
/*
300+
* Returns whether the host has large enough swap offset field to support
301+
* carrying over pgtable A/D bits for page migrations. The result is
302+
* pretty much arch specific.
303+
*/
304+
static inline bool migration_entry_supports_ad(void)
305+
{
306+
#ifdef CONFIG_SWAP
307+
return swap_migration_ad_supported;
308+
#else /* CONFIG_SWAP */
309+
return false;
310+
#endif /* CONFIG_SWAP */
311+
}
312+
313+
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
314+
{
315+
if (migration_entry_supports_ad())
316+
return swp_entry(swp_type(entry),
317+
swp_offset(entry) | SWP_MIG_YOUNG);
318+
return entry;
319+
}
320+
321+
static inline bool is_migration_entry_young(swp_entry_t entry)
322+
{
323+
if (migration_entry_supports_ad())
324+
return swp_offset(entry) & SWP_MIG_YOUNG;
325+
/* Keep the old behavior of aging page after migration */
326+
return false;
327+
}
328+
329+
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
330+
{
331+
if (migration_entry_supports_ad())
332+
return swp_entry(swp_type(entry),
333+
swp_offset(entry) | SWP_MIG_DIRTY);
334+
return entry;
335+
}
336+
337+
static inline bool is_migration_entry_dirty(swp_entry_t entry)
338+
{
339+
if (migration_entry_supports_ad())
340+
return swp_offset(entry) & SWP_MIG_DIRTY;
341+
/* Keep the old behavior of clean page after migration */
342+
return false;
343+
}
344+
243345
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
244346
spinlock_t *ptl);
245347
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
246348
unsigned long address);
247349
#ifdef CONFIG_HUGETLB_PAGE
248350
extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
249351
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
250-
#endif
251-
#else
352+
#endif /* CONFIG_HUGETLB_PAGE */
353+
#else /* CONFIG_MIGRATION */
252354
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
253355
{
254356
return swp_entry(0, 0);
@@ -276,7 +378,7 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
276378
#ifdef CONFIG_HUGETLB_PAGE
277379
static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
278380
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
279-
#endif
381+
#endif /* CONFIG_HUGETLB_PAGE */
280382
static inline int is_writable_migration_entry(swp_entry_t entry)
281383
{
282384
return 0;
@@ -286,7 +388,26 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
286388
return 0;
287389
}
288390

289-
#endif
391+
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
392+
{
393+
return entry;
394+
}
395+
396+
static inline bool is_migration_entry_young(swp_entry_t entry)
397+
{
398+
return false;
399+
}
400+
401+
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
402+
{
403+
return entry;
404+
}
405+
406+
static inline bool is_migration_entry_dirty(swp_entry_t entry)
407+
{
408+
return false;
409+
}
410+
#endif /* CONFIG_MIGRATION */
290411

291412
typedef unsigned long pte_marker;
292413

@@ -369,7 +490,7 @@ static inline int pte_none_mostly(pte_t pte)
369490

370491
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
371492
{
372-
struct page *p = pfn_to_page(swp_offset(entry));
493+
struct page *p = pfn_to_page(swp_offset_pfn(entry));
373494

374495
/*
375496
* Any use of migration entries may only occur while the
@@ -387,6 +508,9 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
387508
*/
388509
static inline bool is_pfn_swap_entry(swp_entry_t entry)
389510
{
511+
/* Make sure the swp offset can always store the needed fields */
512+
BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
513+
390514
return is_migration_entry(entry) || is_device_private_entry(entry) ||
391515
is_device_exclusive_entry(entry);
392516
}
@@ -424,9 +548,9 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
424548

425549
static inline int is_pmd_migration_entry(pmd_t pmd)
426550
{
427-
return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
551+
return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
428552
}
429-
#else
553+
#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
430554
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
431555
struct page *page)
432556
{
@@ -455,7 +579,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
455579
{
456580
return 0;
457581
}
458-
#endif
582+
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
459583

460584
#ifdef CONFIG_MEMORY_FAILURE
461585

@@ -475,11 +599,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
475599
return swp_type(entry) == SWP_HWPOISON;
476600
}
477601

478-
static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
479-
{
480-
return swp_offset(entry);
481-
}
482-
483602
static inline void num_poisoned_pages_inc(void)
484603
{
485604
atomic_long_inc(&num_poisoned_pages);
@@ -495,7 +614,7 @@ static inline void num_poisoned_pages_sub(long i)
495614
atomic_long_sub(i, &num_poisoned_pages);
496615
}
497616

498-
#else
617+
#else /* CONFIG_MEMORY_FAILURE */
499618

500619
static inline swp_entry_t make_hwpoison_entry(struct page *page)
501620
{
@@ -514,20 +633,12 @@ static inline void num_poisoned_pages_inc(void)
514633
static inline void num_poisoned_pages_sub(long i)
515634
{
516635
}
517-
#endif
636+
#endif /* CONFIG_MEMORY_FAILURE */
518637

519-
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
520-
defined(CONFIG_DEVICE_PRIVATE)
521638
static inline int non_swap_entry(swp_entry_t entry)
522639
{
523640
return swp_type(entry) >= MAX_SWAPFILES;
524641
}
525-
#else
526-
static inline int non_swap_entry(swp_entry_t entry)
527-
{
528-
return 0;
529-
}
530-
#endif
531642

532643
#endif /* CONFIG_MMU */
533644
#endif /* _LINUX_SWAPOPS_H */

mm/hmm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
253253
cpu_flags = HMM_PFN_VALID;
254254
if (is_writable_device_private_entry(entry))
255255
cpu_flags |= HMM_PFN_WRITE;
256-
*hmm_pfn = swp_offset(entry) | cpu_flags;
256+
*hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
257257
return 0;
258258
}
259259

0 commit comments

Comments
 (0)