Skip to content

Commit 5f9a4f4

Browse files
Muchun Song authored and torvalds committed
mm: memcontrol: add the missing numa_stat interface for cgroup v2
In the cgroup v1, we have a numa_stat interface. This is useful for providing visibility into the numa locality information within an memcg since the pages are allowed to be allocated from any physical node. One of the use cases is evaluating application performance by combining this information with the application's CPU allocation. But the cgroup v2 does not. So this patch adds the missing information. Suggested-by: Shakeel Butt <shakeelb@google.com> Signed-off-by: Muchun Song <songmuchun@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Cc: Zefan Li <lizefan@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Roman Gushchin <guro@fb.com> Cc: Randy Dunlap <rdunlap@infradead.org> Link: https://lkml.kernel.org/r/20200916100030.71698-2-songmuchun@bytedance.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent bd0b230 commit 5f9a4f4

File tree

2 files changed

+159
-80
lines changed

2 files changed

+159
-80
lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1259,6 +1259,10 @@ PAGE_SIZE multiple when read back.
12591259
can show up in the middle. Don't rely on items remaining in a
12601260
fixed position; use the keys to look up specific values!
12611261

1262+
If the entry has no per-node counter (or is not shown in
1263+
memory.numa_stat), we use 'npn' (non-per-node) as the tag
1264+
to indicate that it will not show in memory.numa_stat.
1265+
12621266
anon
12631267
Amount of memory used in anonymous mappings such as
12641268
brk(), sbrk(), and mmap(MAP_ANONYMOUS)
@@ -1270,15 +1274,11 @@ PAGE_SIZE multiple when read back.
12701274
kernel_stack
12711275
Amount of memory allocated to kernel stacks.
12721276

1273-
slab
1274-
Amount of memory used for storing in-kernel data
1275-
structures.
1276-
1277-
percpu
1277+
percpu(npn)
12781278
Amount of memory used for storing per-cpu kernel
12791279
data structures.
12801280

1281-
sock
1281+
sock(npn)
12821282
Amount of memory used in network transmission buffers
12831283

12841284
shmem
@@ -1318,11 +1318,9 @@ PAGE_SIZE multiple when read back.
13181318
Part of "slab" that cannot be reclaimed on memory
13191319
pressure.
13201320

1321-
pgfault
1322-
Total number of page faults incurred
1323-
1324-
pgmajfault
1325-
Number of major page faults incurred
1321+
slab(npn)
1322+
Amount of memory used for storing in-kernel data
1323+
structures.
13261324

13271325
workingset_refault_anon
13281326
Number of refaults of previously evicted anonymous pages.
@@ -1348,37 +1346,68 @@ PAGE_SIZE multiple when read back.
13481346
workingset_nodereclaim
13491347
Number of times a shadow node has been reclaimed
13501348

1351-
pgrefill
1349+
pgfault(npn)
1350+
Total number of page faults incurred
1351+
1352+
pgmajfault(npn)
1353+
Number of major page faults incurred
1354+
1355+
pgrefill(npn)
13521356
Amount of scanned pages (in an active LRU list)
13531357

1354-
pgscan
1358+
pgscan(npn)
13551359
Amount of scanned pages (in an inactive LRU list)
13561360

1357-
pgsteal
1361+
pgsteal(npn)
13581362
Amount of reclaimed pages
13591363

1360-
pgactivate
1364+
pgactivate(npn)
13611365
Amount of pages moved to the active LRU list
13621366

1363-
pgdeactivate
1367+
pgdeactivate(npn)
13641368
Amount of pages moved to the inactive LRU list
13651369

1366-
pglazyfree
1370+
pglazyfree(npn)
13671371
Amount of pages postponed to be freed under memory pressure
13681372

1369-
pglazyfreed
1373+
pglazyfreed(npn)
13701374
Amount of reclaimed lazyfree pages
13711375

1372-
thp_fault_alloc
1376+
thp_fault_alloc(npn)
13731377
Number of transparent hugepages which were allocated to satisfy
13741378
a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
13751379
is not set.
13761380

1377-
thp_collapse_alloc
1381+
thp_collapse_alloc(npn)
13781382
Number of transparent hugepages which were allocated to allow
13791383
collapsing an existing range of pages. This counter is not
13801384
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
13811385

1386+
memory.numa_stat
1387+
A read-only nested-keyed file which exists on non-root cgroups.
1388+
1389+
This breaks down the cgroup's memory footprint into different
1390+
types of memory, type-specific details, and other information
1391+
per node on the state of the memory management system.
1392+
1393+
This is useful for providing visibility into the NUMA locality
1394+
information within a memcg since the pages are allowed to be
1395+
allocated from any physical node. One of the use cases is evaluating
1396+
application performance by combining this information with the
1397+
application's CPU allocation.
1398+
1399+
All memory amounts are in bytes.
1400+
1401+
The output format of memory.numa_stat is::
1402+
1403+
type N0=<bytes in node 0> N1=<bytes in node 1> ...
1404+
1405+
The entries are ordered to be human readable, and new entries
1406+
can show up in the middle. Don't rely on items remaining in a
1407+
fixed position; use the keys to look up specific values!
1408+
1409+
For the semantics of each entry, refer to memory.stat above.
1410+
13821411
memory.swap.current
13831412
A read-only single value file which exists on non-root
13841413
cgroups.

mm/memcontrol.c

Lines changed: 110 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
14481448
return false;
14491449
}
14501450

1451+
struct memory_stat {
1452+
const char *name;
1453+
unsigned int ratio;
1454+
unsigned int idx;
1455+
};
1456+
1457+
static struct memory_stat memory_stats[] = {
1458+
{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
1459+
{ "file", PAGE_SIZE, NR_FILE_PAGES },
1460+
{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1461+
{ "percpu", 1, MEMCG_PERCPU_B },
1462+
{ "sock", PAGE_SIZE, MEMCG_SOCK },
1463+
{ "shmem", PAGE_SIZE, NR_SHMEM },
1464+
{ "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1465+
{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1466+
{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1467+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1468+
/*
1469+
* The ratio will be initialized in memory_stats_init(). Because
1470+
* on some architectures, the macro of HPAGE_PMD_SIZE is not
1471+
* constant(e.g. powerpc).
1472+
*/
1473+
{ "anon_thp", 0, NR_ANON_THPS },
1474+
#endif
1475+
{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1476+
{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1477+
{ "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1478+
{ "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1479+
{ "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1480+
1481+
/*
1482+
* Note: The slab_reclaimable and slab_unreclaimable must be
1483+
* together and slab_reclaimable must be in front.
1484+
*/
1485+
{ "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1486+
{ "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1487+
1488+
/* The memory events */
1489+
{ "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1490+
{ "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1491+
{ "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1492+
{ "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1493+
{ "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1494+
{ "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1495+
{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
1496+
};
1497+
1498+
static int __init memory_stats_init(void)
1499+
{
1500+
int i;
1501+
1502+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1503+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1504+
if (memory_stats[i].idx == NR_ANON_THPS)
1505+
memory_stats[i].ratio = HPAGE_PMD_SIZE;
1506+
#endif
1507+
VM_BUG_ON(!memory_stats[i].ratio);
1508+
VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1509+
}
1510+
1511+
return 0;
1512+
}
1513+
pure_initcall(memory_stats_init);
1514+
14511515
static char *memory_stat_format(struct mem_cgroup *memcg)
14521516
{
14531517
struct seq_buf s;
@@ -1468,75 +1532,26 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
14681532
* Current memory state:
14691533
*/
14701534

1471-
seq_buf_printf(&s, "anon %llu\n",
1472-
(u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
1473-
PAGE_SIZE);
1474-
seq_buf_printf(&s, "file %llu\n",
1475-
(u64)memcg_page_state(memcg, NR_FILE_PAGES) *
1476-
PAGE_SIZE);
1477-
seq_buf_printf(&s, "kernel_stack %llu\n",
1478-
(u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1479-
1024);
1480-
seq_buf_printf(&s, "slab %llu\n",
1481-
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1482-
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
1483-
seq_buf_printf(&s, "percpu %llu\n",
1484-
(u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
1485-
seq_buf_printf(&s, "sock %llu\n",
1486-
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
1487-
PAGE_SIZE);
1488-
1489-
seq_buf_printf(&s, "shmem %llu\n",
1490-
(u64)memcg_page_state(memcg, NR_SHMEM) *
1491-
PAGE_SIZE);
1492-
seq_buf_printf(&s, "file_mapped %llu\n",
1493-
(u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1494-
PAGE_SIZE);
1495-
seq_buf_printf(&s, "file_dirty %llu\n",
1496-
(u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1497-
PAGE_SIZE);
1498-
seq_buf_printf(&s, "file_writeback %llu\n",
1499-
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
1500-
PAGE_SIZE);
1501-
1502-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1503-
seq_buf_printf(&s, "anon_thp %llu\n",
1504-
(u64)memcg_page_state(memcg, NR_ANON_THPS) *
1505-
HPAGE_PMD_SIZE);
1506-
#endif
1535+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1536+
u64 size;
15071537

1508-
for (i = 0; i < NR_LRU_LISTS; i++)
1509-
seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1510-
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1511-
PAGE_SIZE);
1538+
size = memcg_page_state(memcg, memory_stats[i].idx);
1539+
size *= memory_stats[i].ratio;
1540+
seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
15121541

1513-
seq_buf_printf(&s, "slab_reclaimable %llu\n",
1514-
(u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
1515-
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1516-
(u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
1542+
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1543+
size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1544+
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1545+
seq_buf_printf(&s, "slab %llu\n", size);
1546+
}
1547+
}
15171548

15181549
/* Accumulated memory events */
15191550

15201551
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
15211552
memcg_events(memcg, PGFAULT));
15221553
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
15231554
memcg_events(memcg, PGMAJFAULT));
1524-
1525-
seq_buf_printf(&s, "workingset_refault_anon %lu\n",
1526-
memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
1527-
seq_buf_printf(&s, "workingset_refault_file %lu\n",
1528-
memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
1529-
seq_buf_printf(&s, "workingset_activate_anon %lu\n",
1530-
memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
1531-
seq_buf_printf(&s, "workingset_activate_file %lu\n",
1532-
memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
1533-
seq_buf_printf(&s, "workingset_restore_anon %lu\n",
1534-
memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
1535-
seq_buf_printf(&s, "workingset_restore_file %lu\n",
1536-
memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
1537-
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1538-
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1539-
15401555
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
15411556
memcg_events(memcg, PGREFILL));
15421557
seq_buf_printf(&s, "pgscan %lu\n",
@@ -6374,6 +6389,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
63746389
return 0;
63756390
}
63766391

6392+
#ifdef CONFIG_NUMA
6393+
static int memory_numa_stat_show(struct seq_file *m, void *v)
6394+
{
6395+
int i;
6396+
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6397+
6398+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6399+
int nid;
6400+
6401+
if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6402+
continue;
6403+
6404+
seq_printf(m, "%s", memory_stats[i].name);
6405+
for_each_node_state(nid, N_MEMORY) {
6406+
u64 size;
6407+
struct lruvec *lruvec;
6408+
6409+
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6410+
size = lruvec_page_state(lruvec, memory_stats[i].idx);
6411+
size *= memory_stats[i].ratio;
6412+
seq_printf(m, " N%d=%llu", nid, size);
6413+
}
6414+
seq_putc(m, '\n');
6415+
}
6416+
6417+
return 0;
6418+
}
6419+
#endif
6420+
63776421
static int memory_oom_group_show(struct seq_file *m, void *v)
63786422
{
63796423
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6451,6 +6495,12 @@ static struct cftype memory_files[] = {
64516495
.name = "stat",
64526496
.seq_show = memory_stat_show,
64536497
},
6498+
#ifdef CONFIG_NUMA
6499+
{
6500+
.name = "numa_stat",
6501+
.seq_show = memory_numa_stat_show,
6502+
},
6503+
#endif
64546504
{
64556505
.name = "oom.group",
64566506
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,

0 commit comments

Comments
 (0)