
Commit 4c21e2f

Hugh Dickins authored and Linus Torvalds committed
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent: b38c684
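Two of the tricks the message describes can be made concrete in a short, compileable userspace sketch: the lock shares storage with page->private through a union, and the preprocessor, not Kconfig (which cannot express inequalities), decides whether to split. The NR_CPUS value here, the mock types, and the placement of the size check are illustrative assumptions, not the patch's code; the real definitions appear in the include/linux/mm.h hunk below.

#include <pthread.h>

#define NR_CPUS 8			/* really comes from the kernel configuration */
#define CONFIG_SPLIT_PTLOCK_CPUS 4	/* integer Kconfig symbol; default 4 per the message */

/* Kconfig cannot say "if NR_CPUS >= 4", so the preprocessor compares: */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define SPLIT_PTLOCK 1
#endif

/* Kernel-style compile-time assertion: a negative array size breaks the build */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

struct mock_page {
	unsigned long flags;
	union {				/* the lock shares storage with the private word */
		unsigned long private;
#ifdef SPLIT_PTLOCK
		pthread_spinlock_t ptl;	/* stand-in for the kernel's spinlock_t */
#endif
	} u;
};

int main(void)
{
#ifdef SPLIT_PTLOCK
	/*
	 * A debug spinlock can outgrow a long; the union would then swell
	 * struct page -- the 32-bit PA-RISC + spinlock-debugging case the
	 * message mentions.  Fail the build rather than bloat every page.
	 */
	BUILD_BUG_ON(sizeof(pthread_spinlock_t) > sizeof(unsigned long));
#endif
	return 0;
}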

23 files changed (+138, -79 lines)

arch/arm/mm/mm-armv.c

Lines changed: 1 addition & 0 deletions
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
+	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:
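This hunk is the teardown half of a pairing: pte_lock_init() runs where page table pages are allocated (those call sites are among the 23 changed files but outside this excerpt), and pte_lock_deinit() must run before the page is freed. A compileable userspace mock of the pairing, with pthread calls standing in for kernel spinlock primitives (the explicit destroy step is a pthread requirement; kernel spinlocks need none):

#include <pthread.h>
#include <stddef.h>

struct mock_page {
	void *mapping;			/* the real deinit resets this; see the mm.h hunk */
	union {
		unsigned long private;
		pthread_spinlock_t ptl;	/* stand-in for spinlock_t */
	} u;
};

static void pte_lock_init(struct mock_page *page)
{
	pthread_spin_init(&page->u.ptl, 0);	/* at page table allocation */
}

static void pte_lock_deinit(struct mock_page *page)
{
	pthread_spin_destroy(&page->u.ptl);
	page->mapping = NULL;	/* mirrors the real macro: keeps free_pages_check quiet */
}

int main(void)
{
	struct mock_page pt = { 0 };

	pte_lock_init(&pt);
	/* ... page table in use: PTE updates run under pt.u.ptl ... */
	pte_lock_deinit(&pt);	/* must precede pte_free(), as free_pgd_slow() now does */
	return 0;
}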

arch/frv/mm/pgalloc.c

Lines changed: 2 additions & 2 deletions
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	page->private = (unsigned long) &pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **) page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;

arch/i386/mm/pgtable.c

Lines changed: 4 additions & 4 deletions
@@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
+		set_page_private(pgd_list, (unsigned long)&page->index);
 	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
-		next->private = (unsigned long)pprev;
+		set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
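The pgd_list being touched here is an open-coded intrusive doubly linked list threaded through struct page: page->index holds the next page, and the private word holds a pointer to whatever location points at this page (the head, or the previous page's index field), which is why page_private(page) is cast to struct page **. The same pattern in a compileable mock with invented names:

#include <assert.h>
#include <stddef.h>

struct mock_page {
	struct mock_page *next;		/* plays the role of page->index */
	struct mock_page **pprev;	/* plays the role of page_private(page) */
};

static struct mock_page *pgd_list;

static void pgd_list_add(struct mock_page *page)
{
	page->next = pgd_list;
	if (pgd_list)
		pgd_list->pprev = &page->next;
	pgd_list = page;
	page->pprev = &pgd_list;
}

static void pgd_list_del(struct mock_page *page)
{
	*page->pprev = page->next;	/* unlink, head or middle alike */
	if (page->next)
		page->next->pprev = page->pprev;
}

int main(void)
{
	struct mock_page a, b;

	pgd_list_add(&a);
	pgd_list_add(&b);
	pgd_list_del(&b);
	assert(pgd_list == &a && a.next == NULL);
	pgd_list_del(&a);
	assert(pgd_list == NULL);
	return 0;
}

Storing "prev" as a pointer-to-link rather than a pointer-to-node is what lets pgd_list_del() delete the head without a special case.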

arch/um/kernel/skas/mmu.c

Lines changed: 1 addition & 0 deletions
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
+		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
 		dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES

fs/afs/file.c

Lines changed: 2 additions & 2 deletions
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 	cachefs_uncache_page(vnode->cache, page);
 #endif
 
-	pageio = (struct cachefs_page *) page->private;
-	page->private = 0;
+	pageio = (struct cachefs_page *) page_private(page);
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 
 	if (pageio)

fs/buffer.c

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
 
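All of these filesystem conversions are mechanical for the same reason: the include/linux/mm.h hunk below moves private inside a named union u, after which a direct page->private dereference no longer compiles and every user must go through page_private()/set_page_private(). In miniature:

/* Minimal illustration: once `private` lives inside union u, the old
 * page->private spelling is a compile error, hence the accessors. */
struct mock_page {
	union {
		unsigned long private;
		/* spinlock_t ptl; lives here when ptlock is split */
	} u;
};

#define page_private(page)		((page)->u.private)
#define set_page_private(page, v)	((page)->u.private = (v))

int main(void)
{
	struct mock_page page = { { 0 } };

	set_page_private(&page, 42UL);		/* was: page->private = 42; */
	return page_private(&page) == 42UL ? 0 : 1;
}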
fs/jfs/jfs_metapage.c

Lines changed: 6 additions & 6 deletions
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 	a->mp[index] = NULL;
 	if (--a->mp_count == 0) {
 		kfree(a);
-		page->private = 0;
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 		kunmap(page);
 	}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }

fs/xfs/linux-2.6/xfs_buf.c

Lines changed: 4 additions & 3 deletions
@@ -181,8 +181,9 @@ set_page_region(
 	size_t		offset,
 	size_t		length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long	mask = page_region_mask(offset, length);
 
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
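Here xfs uses the private word as data in its own right: a bitmask with one bit per sub-page region that holds valid data, so the page becomes fully uptodate exactly when the mask reads all-ones, and the old |= has to become an explicit read-modify-write through the accessors. A rough, compileable illustration - the page_region_mask() body below is a guess at the shape of the real helper elsewhere in xfs_buf.c, not a copy of it:

#include <stddef.h>
#include <stdio.h>

#define REGION_SIZE 512UL	/* hypothetical sub-page region size */

/* Guess at the shape of xfs's page_region_mask(): one bit per region
 * overlapped by [offset, offset + length).  Illustrative only. */
static unsigned long page_region_mask(size_t offset, size_t length)
{
	size_t first = offset / REGION_SIZE;
	size_t final = (offset + length - 1) / REGION_SIZE;
	unsigned long mask = 0;

	while (first <= final)
		mask |= 1UL << first++;
	return mask;
}

int main(void)
{
	unsigned long priv = 0;	/* stands in for page_private(page) */

	/* set_page_region(): explicit read-modify-write, as in the hunk */
	priv |= page_region_mask(0, 1024);	/* marks regions 0 and 1 valid */

	/* the page is fully uptodate only when every region bit is set */
	printf("uptodate: %d\n", priv == ~0UL);
	return 0;
}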

include/linux/buffer_head.h

Lines changed: 3 additions & 3 deletions
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
-		BUG_ON(!PagePrivate(page));			\
-		((struct buffer_head *)(page)->private);	\
+		BUG_ON(!PagePrivate(page));			\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)

include/linux/mm.h

Lines changed: 38 additions & 8 deletions
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
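The payoff is the last change above: pte_offset_map_lock() now returns whichever lock covers the page table in question, so its callers need no change whether ptlock is split or not. A compileable userspace mock of that dispatch; all names are stand-ins, and the real macro takes a pmd, not a page:

#include <pthread.h>
#include <stddef.h>

#define SPLIT 1			/* toggle to mimic the two configurations */

struct mock_page {
	union {
		unsigned long private;
		pthread_mutex_t ptl;	/* stand-in for spinlock_t */
	} u;
};

struct mock_mm {
	pthread_mutex_t page_table_lock;
};

#if SPLIT
#define pte_lockptr(mm, page)	((void)(mm), &(page)->u.ptl)	/* one per page table page */
#else
#define pte_lockptr(mm, page)	((void)(page), &(mm)->page_table_lock)	/* one per mm */
#endif

/* Caller pattern, mirroring pte_offset_map_lock(): take whichever lock
 * covers this page table, update entries, release the same lock. */
static void update_ptes(struct mock_mm *mm, struct mock_page *pt)
{
	pthread_mutex_t *ptl = pte_lockptr(mm, pt);

	pthread_mutex_lock(ptl);
	/* ... set page table entries guarded by ptl ... */
	pthread_mutex_unlock(ptl);
}

int main(void)
{
	struct mock_mm mm = { PTHREAD_MUTEX_INITIALIZER };
	struct mock_page pt;

	pthread_mutex_init(&pt.u.ptl, NULL);
	update_ptes(&mm, &pt);
	return 0;
}

With the split configuration, two threads faulting in different parts of a large area contend only when their PTEs share a page table page, which is the scalability fix the commit message describes.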
