Skip to content

Commit 4037d45

Browse files
Christoph Lameter authored and Linus Torvalds committed
Move remote node draining out of slab allocators
Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB. Move the node draining so that is is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor. Add a expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds. Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcp is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for large amount of memory being caught in them. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 77461ab commit 4037d45

File tree

6 files changed

+67
-131
lines changed

6 files changed

+67
-131
lines changed

include/linux/gfp.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
176176
#define free_page(addr) free_pages((addr),0)
177177

178178
void page_alloc_init(void);
179-
#ifdef CONFIG_NUMA
180-
void drain_node_pages(int node);
181-
#else
182-
static inline void drain_node_pages(int node) { };
183-
#endif
179+
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
184180

185181
#endif /* __LINUX_GFP_H */

include/linux/mmzone.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ struct per_cpu_pages {
8383

8484
struct per_cpu_pageset {
8585
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
86+
#ifdef CONFIG_NUMA
87+
s8 expire;
88+
#endif
8689
#ifdef CONFIG_SMP
8790
s8 stat_threshold;
8891
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];

mm/page_alloc.c

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
691691

692692
#ifdef CONFIG_NUMA
693693
/*
694-
* Called from the slab reaper to drain pagesets on a particular node that
695-
* belongs to the currently executing processor.
694+
* Called from the vmstat counter updater to drain pagesets of this
695+
* currently executing processor on remote nodes after they have
696+
* expired.
697+
*
696698
* Note that this function must be called with the thread pinned to
697699
* a single processor.
698700
*/
699-
void drain_node_pages(int nodeid)
701+
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
700702
{
701-
int i;
702-
enum zone_type z;
703703
unsigned long flags;
704+
int to_drain;
704705

705-
for (z = 0; z < MAX_NR_ZONES; z++) {
706-
struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
707-
struct per_cpu_pageset *pset;
708-
709-
if (!populated_zone(zone))
710-
continue;
711-
712-
pset = zone_pcp(zone, smp_processor_id());
713-
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
714-
struct per_cpu_pages *pcp;
715-
716-
pcp = &pset->pcp[i];
717-
if (pcp->count) {
718-
int to_drain;
719-
720-
local_irq_save(flags);
721-
if (pcp->count >= pcp->batch)
722-
to_drain = pcp->batch;
723-
else
724-
to_drain = pcp->count;
725-
free_pages_bulk(zone, to_drain, &pcp->list, 0);
726-
pcp->count -= to_drain;
727-
local_irq_restore(flags);
728-
}
729-
}
730-
}
706+
local_irq_save(flags);
707+
if (pcp->count >= pcp->batch)
708+
to_drain = pcp->batch;
709+
else
710+
to_drain = pcp->count;
711+
free_pages_bulk(zone, to_drain, &pcp->list, 0);
712+
pcp->count -= to_drain;
713+
local_irq_restore(flags);
731714
}
732715
#endif
733716

mm/slab.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -928,12 +928,6 @@ static void next_reap_node(void)
928928
{
929929
int node = __get_cpu_var(reap_node);
930930

931-
/*
932-
* Also drain per cpu pages on remote zones
933-
*/
934-
if (node != numa_node_id())
935-
drain_node_pages(node);
936-
937931
node = next_node(node, node_online_map);
938932
if (unlikely(node >= MAX_NUMNODES))
939933
node = first_node(node_online_map);

mm/slub.c

Lines changed: 0 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
25302530

25312531
#endif
25322532

2533-
#ifdef CONFIG_NUMA
2534-
2535-
/*****************************************************************
2536-
* Generic reaper used to support the page allocator
2537-
* (the cpu slabs are reaped by a per slab workqueue).
2538-
*
2539-
* Maybe move this to the page allocator?
2540-
****************************************************************/
2541-
2542-
static DEFINE_PER_CPU(unsigned long, reap_node);
2543-
2544-
static void init_reap_node(int cpu)
2545-
{
2546-
int node;
2547-
2548-
node = next_node(cpu_to_node(cpu), node_online_map);
2549-
if (node == MAX_NUMNODES)
2550-
node = first_node(node_online_map);
2551-
2552-
__get_cpu_var(reap_node) = node;
2553-
}
2554-
2555-
static void next_reap_node(void)
2556-
{
2557-
int node = __get_cpu_var(reap_node);
2558-
2559-
/*
2560-
* Also drain per cpu pages on remote zones
2561-
*/
2562-
if (node != numa_node_id())
2563-
drain_node_pages(node);
2564-
2565-
node = next_node(node, node_online_map);
2566-
if (unlikely(node >= MAX_NUMNODES))
2567-
node = first_node(node_online_map);
2568-
__get_cpu_var(reap_node) = node;
2569-
}
2570-
#else
2571-
#define init_reap_node(cpu) do { } while (0)
2572-
#define next_reap_node(void) do { } while (0)
2573-
#endif
2574-
2575-
#define REAPTIMEOUT_CPUC (2*HZ)
2576-
2577-
#ifdef CONFIG_SMP
2578-
static DEFINE_PER_CPU(struct delayed_work, reap_work);
2579-
2580-
static void cache_reap(struct work_struct *unused)
2581-
{
2582-
next_reap_node();
2583-
schedule_delayed_work(&__get_cpu_var(reap_work),
2584-
REAPTIMEOUT_CPUC);
2585-
}
2586-
2587-
static void __devinit start_cpu_timer(int cpu)
2588-
{
2589-
struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2590-
2591-
/*
2592-
* When this gets called from do_initcalls via cpucache_init(),
2593-
* init_workqueues() has already run, so keventd will be setup
2594-
* at that time.
2595-
*/
2596-
if (keventd_up() && reap_work->work.func == NULL) {
2597-
init_reap_node(cpu);
2598-
INIT_DELAYED_WORK(reap_work, cache_reap);
2599-
schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2600-
}
2601-
}
2602-
2603-
static int __init cpucache_init(void)
2604-
{
2605-
int cpu;
2606-
2607-
/*
2608-
* Register the timers that drain pcp pages and update vm statistics
2609-
*/
2610-
for_each_online_cpu(cpu)
2611-
start_cpu_timer(cpu);
2612-
return 0;
2613-
}
2614-
__initcall(cpucache_init);
2615-
#endif
2616-
26172533
void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
26182534
{
26192535
struct kmem_cache *s = get_slab(size, gfpflags);

mm/vmstat.c

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281281

282282
/*
283283
* Update the zone counters for one cpu.
284+
*
285+
* Note that refresh_cpu_vm_stats strives to only access
286+
* node local memory. The per cpu pagesets on remote zones are placed
287+
* in the memory local to the processor using that pageset. So the
288+
* loop over all zones will access a series of cachelines local to
289+
* the processor.
290+
*
291+
* The call to zone_page_state_add updates the cachelines with the
292+
* statistics in the remote zone struct as well as the global cachelines
293+
* with the global counters. These could cause remote node cache line
294+
* bouncing and will have to be only done when necessary.
284295
*/
285296
void refresh_cpu_vm_stats(int cpu)
286297
{
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
289300
unsigned long flags;
290301

291302
for_each_zone(zone) {
292-
struct per_cpu_pageset *pcp;
303+
struct per_cpu_pageset *p;
293304

294305
if (!populated_zone(zone))
295306
continue;
296307

297-
pcp = zone_pcp(zone, cpu);
308+
p = zone_pcp(zone, cpu);
298309

299310
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300-
if (pcp->vm_stat_diff[i]) {
311+
if (p->vm_stat_diff[i]) {
301312
local_irq_save(flags);
302-
zone_page_state_add(pcp->vm_stat_diff[i],
313+
zone_page_state_add(p->vm_stat_diff[i],
303314
zone, i);
304-
pcp->vm_stat_diff[i] = 0;
315+
p->vm_stat_diff[i] = 0;
316+
#ifdef CONFIG_NUMA
317+
/* 3 seconds idle till flush */
318+
p->expire = 3;
319+
#endif
305320
local_irq_restore(flags);
306321
}
322+
#ifdef CONFIG_NUMA
323+
/*
324+
* Deal with draining the remote pageset of this
325+
* processor
326+
*
327+
* Check if there are pages remaining in this pageset
328+
* if not then there is nothing to expire.
329+
*/
330+
if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
331+
continue;
332+
333+
/*
334+
* We never drain zones local to this processor.
335+
*/
336+
if (zone_to_nid(zone) == numa_node_id()) {
337+
p->expire = 0;
338+
continue;
339+
}
340+
341+
p->expire--;
342+
if (p->expire)
343+
continue;
344+
345+
if (p->pcp[0].count)
346+
drain_zone_pages(zone, p->pcp + 0);
347+
348+
if (p->pcp[1].count)
349+
drain_zone_pages(zone, p->pcp + 1);
350+
#endif
307351
}
308352
}
309353

0 commit comments

Comments
 (0)