Commit 7cc36bb

Christoph Lameter authored and torvalds committed
vmstat: on-demand vmstat workers V8
vmstat workers are used for folding counter differentials into the zone, per-node and global counters at certain time intervals. They currently run at defined intervals on all processors, which causes some holdoff for processors that need minimal intrusion by the OS. The current vmstat_update mechanism depends on a deferrable timer firing every other second by default, which registers a work queue item that runs on the local CPU. The result is that we have one interrupt and one additional schedulable task on each CPU every 2 seconds.

If a workload indeed causes VM activity or multiple tasks are running on a CPU, then there are probably bigger issues to deal with. However, some workloads dedicate a CPU to a single CPU-bound task. This is done in high performance computing, in high frequency financial applications and in networking (Intel DPDK, EZchip NPS). With the advent of systems with more and more CPUs over time, this may become more common: when one has enough CPUs, one cares less about efficiently sharing a CPU with other tasks and more about efficiently monopolizing a CPU per task.

The difference made by having this timer fire and a workqueue kernel thread scheduled periodically can be enormous. An artificial test measuring the worst-case time to do a simple "i++" in an endless loop on a bare metal system and under Linux on an isolated CPU with dynticks, with and without this patch, has Linux matching the bare metal performance (~700 cycles) with this patch and losing by a couple of orders of magnitude (~200k cycles) without it[*]. The loss occurs for something that just calculates statistics. For networking applications, for example, this could be the difference between dropping packets and sustaining line rate.

Statistics are important and useful, but it would be great if there were a way to keep statistics gathering from producing a huge performance difference. This patch does just that.

This patch creates a vmstat shepherd worker that monitors the per-cpu differentials on all processors. If there are differentials on a processor, then a vmstat worker local to that processor is created. That worker will then start folding the diffs at regular intervals. Should the worker find that there is no work to be done, it will make the shepherd worker monitor the differentials again.

With this patch it is then possible to have periods longer than 2 seconds without any OS event on a "cpu" (hardware thread).

The patch shows a very minor increase in system performance.
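For intuition, here is a minimal userspace sketch of the shepherd idea (an illustration only, not kernel code; names such as pcp_diff, fold_cpu and shepherd_pass are invented for this example): the shepherd scans each cpu's byte-sized differentials and only triggers folding work where something is actually pending.

	/*
	 * Minimal userspace sketch of the on-demand shepherd idea.
	 * Not kernel code: pcp_diff, global_stat, fold_cpu and
	 * shepherd_pass are illustrative names only.
	 */
	#include <stdio.h>
	#include <string.h>

	#define NCPU   4
	#define NITEMS 8

	static signed char pcp_diff[NCPU][NITEMS];	/* per-cpu differentials */
	static long global_stat[NITEMS];		/* global counters */

	/* Fold one cpu's diffs into the globals; return #counters updated. */
	static int fold_cpu(int cpu)
	{
		int changes = 0;

		for (int i = 0; i < NITEMS; i++)
			if (pcp_diff[cpu][i]) {
				global_stat[i] += pcp_diff[cpu][i];
				pcp_diff[cpu][i] = 0;
				changes++;
			}
		return changes;
	}

	/* Shepherd pass: trigger folding only where diffs are pending. */
	static void shepherd_pass(void)
	{
		static const signed char zero[NITEMS];

		for (int cpu = 0; cpu < NCPU; cpu++)
			if (memcmp(pcp_diff[cpu], zero, NITEMS))
				printf("cpu %d: folded %d counters\n",
				       cpu, fold_cpu(cpu));
	}

	int main(void)
	{
		pcp_diff[2][5] = 3;	/* pretend cpu 2 saw some VM activity */
		shepherd_pass();	/* only cpu 2 gets any work */
		return 0;
	}

In the kernel the "wake" step is schedule_delayed_work_on() and the quiescence test is memchr_inv(), as the diff below shows.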
hackbench -s 512 -l 2000 -g 15 -f 25 -P

Results before the patch:

Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.992
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.971
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 5.063

Hackbench after the patch:

Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.973
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.990
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.993

[fengguang.wu@intel.com: cpu_stat_off can be static]
Signed-off-by: Christoph Lameter <cl@linux.com>
Reviewed-by: Gilad Ben-Yossef <gilad@benyossef.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: Max Krasnyansky <maxk@qti.qualcomm.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent f0d6d1f commit 7cc36bb

mm/vmstat.c

Lines changed: 120 additions & 21 deletions
@@ -7,13 +7,15 @@
  *  zoned VM statistics
  *  Copyright (C) 2006 Silicon Graphics, Inc.,
  *		Christoph Lameter <christoph@lameter.com>
+ *  Copyright (C) 2008-2014 Christoph Lameter
  */
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/cpumask.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 EXPORT_SYMBOL(dec_zone_page_state);
 #endif
 
-static inline void fold_diff(int *diff)
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+static int fold_diff(int *diff)
 {
 	int i;
+	int changes = 0;
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (diff[i])
+		if (diff[i]) {
 			atomic_long_add(diff[i], &vm_stat[i]);
+			changes++;
+	}
+	return changes;
 }
 
 /*
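The significant change in this hunk is the return value: a zero change count lets the caller conclude that this cpu has been quiescent. A rough standalone rendering of the same contract (vm_stat_demo and NR_ITEMS are stand-ins for vm_stat and NR_VM_ZONE_STAT_ITEMS):

	#include <stdatomic.h>

	#define NR_ITEMS 8
	static atomic_long vm_stat_demo[NR_ITEMS];	/* stand-in for vm_stat */

	/* Fold a differential into the globals; return #counters touched. */
	static int fold_diff_demo(int *diff)
	{
		int changes = 0;

		for (int i = 0; i < NR_ITEMS; i++)
			if (diff[i]) {
				atomic_fetch_add(&vm_stat_demo[i], diff[i]);
				changes++;
			}
		return changes;	/* 0 => nothing pending, worker can stand down */
	}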
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
  * statistics in the remote zone struct as well as the global cachelines
  * with the global counters. These could cause remote node cache line
  * bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
  */
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
 {
 	struct zone *zone;
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int changes = 0;
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
 			continue;
 		}
 
-
 		if (__this_cpu_dec_return(p->expire))
 			continue;
 
-		if (__this_cpu_read(p->pcp.count))
+		if (__this_cpu_read(p->pcp.count)) {
 			drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+			changes++;
+		}
 #endif
 	}
-	fold_diff(global_diff);
+	changes += fold_diff(global_diff);
+	return changes;
 }
 
 /*
@@ -1239,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
+static cpumask_var_t cpu_stat_off;
 
 static void vmstat_update(struct work_struct *w)
 {
-	refresh_cpu_vm_stats();
-	schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+	if (refresh_cpu_vm_stats())
+		/*
+		 * Counters were updated so we expect more updates
+		 * to occur in the future. Keep on running the
+		 * update worker thread.
+		 */
+		schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+			round_jiffies_relative(sysctl_stat_interval));
+	else {
+		/*
+		 * We did not update any counters so the app may be in
+		 * a mode where it does not cause counter updates.
+		 * We may be uselessly running vmstat_update.
+		 * Defer the checking for differentials to the
+		 * shepherd thread on a different processor.
+		 */
+		int r;
+		/*
+		 * Shepherd work thread does not race since it never
+		 * changes the bit if its zero but the cpu
+		 * online / off line code may race if
+		 * worker threads are still allowed during
+		 * shutdown / startup.
+		 */
+		r = cpumask_test_and_set_cpu(smp_processor_id(),
+			cpu_stat_off);
+		VM_BUG_ON(r);
+	}
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static bool need_update(int cpu)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone) {
+		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+		/*
+		 * The fast way of checking if there are any vmstat diffs.
+		 * This works because the diffs are byte sized items.
+		 */
+		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+			return true;
+
+	}
+	return false;
+}
+
+
+/*
+ * Shepherd worker thread that checks the
+ * differentials of processors that have their worker
+ * threads for vm statistics updates disabled because of
+ * inactivity.
+ */
+static void vmstat_shepherd(struct work_struct *w);
+
+static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+	int cpu;
+
+	get_online_cpus();
+	/* Check processors whose vmstat worker threads have been disabled */
+	for_each_cpu(cpu, cpu_stat_off)
+		if (need_update(cpu) &&
+			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+
+			schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
+				__round_jiffies_relative(sysctl_stat_interval, cpu));
+
+	put_online_cpus();
+
+	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
+
 }
 
-static void start_cpu_timer(int cpu)
+static void __init start_shepherd_timer(void)
 {
-	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+			vmstat_update);
+
+	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
+		BUG();
+	cpumask_copy(cpu_stat_off, cpu_online_mask);
 
-	INIT_DEFERRABLE_WORK(work, vmstat_update);
-	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+	schedule_delayed_work(&shepherd,
+		round_jiffies_relative(sysctl_stat_interval));
 }
 
 static void vmstat_cpu_dead(int node)
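need_update() above leans on memchr_inv() to scan the whole byte-sized diff array in one call, and the BUILD_BUG_ON guards the byte-size assumption. memchr_inv() is kernel-internal; for illustration, a plain-C approximation of that check (any_nonzero is an invented name):

	#include <stdbool.h>
	#include <stddef.h>

	/* Return true if any byte in diff[0..n) is nonzero. */
	static bool any_nonzero(const signed char *diff, size_t n)
	{
		for (size_t i = 0; i < n; i++)
			if (diff[i])
				return true;	/* first nonzero byte ends the scan */
		return false;
	}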
@@ -1283,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		refresh_zone_stat_thresholds();
-		start_cpu_timer(cpu);
 		node_set_state(cpu_to_node(cpu), N_CPU);
+		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
-		per_cpu(vmstat_work, cpu).work.func = NULL;
+		cpumask_clear_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		start_cpu_timer(cpu);
+		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
@@ -1313,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
 static int __init setup_vmstat(void)
 {
 #ifdef CONFIG_SMP
-	int cpu;
-
 	cpu_notifier_register_begin();
 	__register_cpu_notifier(&vmstat_notifier);
 
-	for_each_online_cpu(cpu) {
-		start_cpu_timer(cpu);
-		node_set_state(cpu_to_node(cpu), N_CPU);
-	}
+	start_shepherd_timer();
 	cpu_notifier_register_done();
 #endif
 #ifdef CONFIG_PROC_FS
