Skip to content

Commit f477619

Browse files
almasrymina
authored and torvalds committed
hugetlb: add hugetlb.*.numa_stat file
For hugetlb backed jobs/VMs it's critical to understand the numa information for the memory backing these jobs to deliver optimal performance. Currently this technically can be queried from /proc/self/numa_maps, but there are significant issues with that. Namely: 1. Memory can be mapped or unmapped. 2. numa_maps are per process and need to be aggregated across all processes in the cgroup. For shared memory this is more involved as the userspace needs to make sure it doesn't double count shared mappings. 3. I believe querying numa_maps needs to hold the mmap_lock which adds to the contention on this lock. For these reasons I propose simply adding hugetlb.*.numa_stat file, which shows the numa information of the cgroup similarly to memory.numa_stat. On cgroup-v2: cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 On cgroup-v1: cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 hierarchical_total=2097152 N0=2097152 N1=0 This patch was tested manually by allocating hugetlb memory and querying the hugetlb.*.numa_stat file of the cgroup and its parents. 
[colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"] Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com [keescook@chromium.org: fix copy/paste array assignment] Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com Signed-off-by: Mina Almasry <almasrymina@google.com> Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Michal Hocko <mhocko@suse.com> Cc: David Rientjes <rientjes@google.com> Cc: Jue Wang <juew@google.com> Cc: Yang Yao <ygyao@google.com> Cc: Joanna Li <joannali@google.com> Cc: Cannon Matthews <cannonmatthews@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent c4dc63f commit f477619

File tree

5 files changed

+141
-12
lines changed

5 files changed

+141
-12
lines changed

Documentation/admin-guide/cgroup-v1/hugetlb.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,14 @@ Brief summary of control files::
2929
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
3030
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
3131
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
32+
hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup
3233

3334
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
3435
files include::
3536

3637
hugetlb.1GB.limit_in_bytes
3738
hugetlb.1GB.max_usage_in_bytes
39+
hugetlb.1GB.numa_stat
3840
hugetlb.1GB.usage_in_bytes
3941
hugetlb.1GB.failcnt
4042
hugetlb.1GB.rsvd.limit_in_bytes
@@ -43,6 +45,7 @@ files include::
4345
hugetlb.1GB.rsvd.failcnt
4446
hugetlb.64KB.limit_in_bytes
4547
hugetlb.64KB.max_usage_in_bytes
48+
hugetlb.64KB.numa_stat
4649
hugetlb.64KB.usage_in_bytes
4750
hugetlb.64KB.failcnt
4851
hugetlb.64KB.rsvd.limit_in_bytes
@@ -51,6 +54,7 @@ files include::
5154
hugetlb.64KB.rsvd.failcnt
5255
hugetlb.32MB.limit_in_bytes
5356
hugetlb.32MB.max_usage_in_bytes
57+
hugetlb.32MB.numa_stat
5458
hugetlb.32MB.usage_in_bytes
5559
hugetlb.32MB.failcnt
5660
hugetlb.32MB.rsvd.limit_in_bytes

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2266,6 +2266,11 @@ HugeTLB Interface Files
22662266
are local to the cgroup i.e. not hierarchical. The file modified event
22672267
generated on this file reflects only the local events.
22682268

2269+
hugetlb.<hugepagesize>.numa_stat
2270+
Similar to memory.numa_stat, it shows the numa information of the
2271+
hugetlb pages of <hugepagesize> in this cgroup. Only active in
2272+
use hugetlb pages are included. The per-node values are in bytes.
2273+
22692274
Misc
22702275
----
22712276

include/linux/hugetlb.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,8 +622,8 @@ struct hstate {
622622
#endif
623623
#ifdef CONFIG_CGROUP_HUGETLB
624624
/* cgroup control files */
625-
struct cftype cgroup_files_dfl[7];
626-
struct cftype cgroup_files_legacy[9];
625+
struct cftype cgroup_files_dfl[8];
626+
struct cftype cgroup_files_legacy[10];
627627
#endif
628628
char name[HSTATE_NAME_LEN];
629629
};

include/linux/hugetlb_cgroup.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ enum hugetlb_memory_event {
3636
HUGETLB_NR_MEMORY_EVENTS,
3737
};
3838

39+
struct hugetlb_cgroup_per_node {
40+
/* hugetlb usage in pages over all hstates. */
41+
unsigned long usage[HUGE_MAX_HSTATE];
42+
};
43+
3944
struct hugetlb_cgroup {
4045
struct cgroup_subsys_state css;
4146

@@ -57,6 +62,8 @@ struct hugetlb_cgroup {
5762

5863
/* Handle for "hugetlb.events.local" */
5964
struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
65+
66+
struct hugetlb_cgroup_per_node *nodeinfo[];
6067
};
6168

6269
static inline struct hugetlb_cgroup *

mm/hugetlb_cgroup.c

Lines changed: 123 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
123123
}
124124
}
125125

126+
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
127+
{
128+
int node;
129+
130+
for_each_node(node)
131+
kfree(h_cgroup->nodeinfo[node]);
132+
kfree(h_cgroup);
133+
}
134+
126135
static struct cgroup_subsys_state *
127136
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
128137
{
129138
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
130139
struct hugetlb_cgroup *h_cgroup;
140+
int node;
141+
142+
h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
143+
GFP_KERNEL);
131144

132-
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
133145
if (!h_cgroup)
134146
return ERR_PTR(-ENOMEM);
135147

136148
if (!parent_h_cgroup)
137149
root_h_cgroup = h_cgroup;
138150

151+
/*
152+
* TODO: this routine can waste much memory for nodes which will
153+
* never be onlined. It's better to use memory hotplug callback
154+
* function.
155+
*/
156+
for_each_node(node) {
157+
/* Set node_to_alloc to -1 for offline nodes. */
158+
int node_to_alloc =
159+
node_state(node, N_NORMAL_MEMORY) ? node : -1;
160+
h_cgroup->nodeinfo[node] =
161+
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
162+
GFP_KERNEL, node_to_alloc);
163+
if (!h_cgroup->nodeinfo[node])
164+
goto fail_alloc_nodeinfo;
165+
}
166+
139167
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
140168
return &h_cgroup->css;
169+
170+
fail_alloc_nodeinfo:
171+
hugetlb_cgroup_free(h_cgroup);
172+
return ERR_PTR(-ENOMEM);
141173
}
142174

143175
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
144176
{
145-
struct hugetlb_cgroup *h_cgroup;
146-
147-
h_cgroup = hugetlb_cgroup_from_css(css);
148-
kfree(h_cgroup);
177+
hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
149178
}
150179

151180
/*
@@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
289318
return;
290319

291320
__set_hugetlb_cgroup(page, h_cg, rsvd);
292-
return;
321+
if (!rsvd) {
322+
unsigned long usage =
323+
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
324+
/*
325+
* This write is not atomic due to fetching usage and writing
326+
* to it, but that's fine because we call this with
327+
* hugetlb_lock held anyway.
328+
*/
329+
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
330+
usage + nr_pages);
331+
}
293332
}
294333

295334
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
@@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
328367

329368
if (rsvd)
330369
css_put(&h_cg->css);
331-
332-
return;
370+
else {
371+
unsigned long usage =
372+
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
373+
/*
374+
* This write is not atomic due to fetching usage and writing
375+
* to it, but that's fine because we call this with
376+
* hugetlb_lock held anyway.
377+
*/
378+
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
379+
usage - nr_pages);
380+
}
333381
}
334382

335383
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
@@ -418,6 +466,59 @@ enum {
418466
RES_RSVD_FAILCNT,
419467
};
420468

469+
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
470+
{
471+
int nid;
472+
struct cftype *cft = seq_cft(seq);
473+
int idx = MEMFILE_IDX(cft->private);
474+
bool legacy = MEMFILE_ATTR(cft->private);
475+
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
476+
struct cgroup_subsys_state *css;
477+
unsigned long usage;
478+
479+
if (legacy) {
480+
/* Add up usage across all nodes for the non-hierarchical total. */
481+
usage = 0;
482+
for_each_node_state(nid, N_MEMORY)
483+
usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
484+
seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
485+
486+
/* Simply print the per-node usage for the non-hierarchical total. */
487+
for_each_node_state(nid, N_MEMORY)
488+
seq_printf(seq, " N%d=%lu", nid,
489+
READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
490+
PAGE_SIZE);
491+
seq_putc(seq, '\n');
492+
}
493+
494+
/*
495+
* The hierarchical total is pretty much the value recorded by the
496+
* counter, so use that.
497+
*/
498+
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
499+
page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
500+
501+
/*
502+
* For each node, traverse the css tree to obtain the hierarchical
503+
* node usage.
504+
*/
505+
for_each_node_state(nid, N_MEMORY) {
506+
usage = 0;
507+
rcu_read_lock();
508+
css_for_each_descendant_pre(css, &h_cg->css) {
509+
usage += READ_ONCE(hugetlb_cgroup_from_css(css)
510+
->nodeinfo[nid]
511+
->usage[idx]);
512+
}
513+
rcu_read_unlock();
514+
seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
515+
}
516+
517+
seq_putc(seq, '\n');
518+
519+
return 0;
520+
}
521+
421522
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
422523
struct cftype *cft)
423524
{
@@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
668769
events_local_file[idx]);
669770
cft->flags = CFTYPE_NOT_ON_ROOT;
670771

671-
/* NULL terminate the last cft */
772+
/* Add the numa stat file */
672773
cft = &h->cgroup_files_dfl[6];
774+
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
775+
cft->seq_show = hugetlb_cgroup_read_numa_stat;
776+
cft->flags = CFTYPE_NOT_ON_ROOT;
777+
778+
/* NULL terminate the last cft */
779+
cft = &h->cgroup_files_dfl[7];
673780
memset(cft, 0, sizeof(*cft));
674781

675782
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
@@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
739846
cft->write = hugetlb_cgroup_reset;
740847
cft->read_u64 = hugetlb_cgroup_read_u64;
741848

742-
/* NULL terminate the last cft */
849+
/* Add the numa stat file */
743850
cft = &h->cgroup_files_legacy[8];
851+
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
852+
cft->private = MEMFILE_PRIVATE(idx, 1);
853+
cft->seq_show = hugetlb_cgroup_read_numa_stat;
854+
855+
/* NULL terminate the last cft */
856+
cft = &h->cgroup_files_legacy[9];
744857
memset(cft, 0, sizeof(*cft));
745858

746859
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,

0 commit comments

Comments
 (0)