Skip to content

Commit 49224a3

Browse files
committed
Merge patch series "nsfs: iterate through mount namespaces"
Christian Brauner <brauner@kernel.org> says: Recently, we added the ability to list mounts in other mount namespaces and the ability to retrieve namespace file descriptors without having to go through procfs by deriving them from pidfds. This extends nsfs in two ways: (1) Add the ability to retrieve information about a mount namespace via NS_MNT_GET_INFO. This will return the mount namespace id and the number of mounts currently in the mount namespace. The number of mounts can be used to size the buffer that needs to be used for listmount() and is in general useful without having to actually iterate through all the mounts. The structure is extensible. (2) Add the ability to iterate through all mount namespaces over which the caller holds privilege returning the file descriptor for the next or previous mount namespace. To retrieve a mount namespace the caller must be privileged with respect to its owning user namespace. This means that PID 1 on the host can list all mounts in all mount namespaces or that a container can list all mounts of its nested containers. Optionally pass a structure for NS_MNT_GET_INFO with NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount namespace in one go. (1) and (2) can be implemented for other namespace types easily. Together with recent API additions this means one can iterate through all mounts in all mount namespaces without ever touching procfs. Here's a sample program list_all_mounts_everywhere.c: // SPDX-License-Identifier: GPL-2.0-or-later #define _GNU_SOURCE #include <asm/unistd.h> #include <assert.h> #include <errno.h> #include <fcntl.h> #include <getopt.h> #include <linux/stat.h> #include <sched.h> #include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> #include <sys/param.h> #include <sys/pidfd.h> #include <sys/stat.h> #include <sys/statfs.h> #define die_errno(format, ...) 
\ do { \ fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \ __LINE__, __func__, ##__VA_ARGS__); \ exit(EXIT_FAILURE); \ } while (0) /* Get the id for a mount namespace */ #define NS_GET_MNTNS_ID _IO(0xb7, 0x5) /* Get next mount namespace. */ struct mnt_ns_info { __u32 size; __u32 nr_mounts; __u64 mnt_ns_id; }; #define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ /* Get information about namespace. */ #define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) /* Get next namespace. */ #define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) #define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3) #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define __NR_listmount 458 #define __NR_statmount 457 /* * @Mask bits for statmount(2) */ #define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ #define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ #define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ struct statmount { __u32 size; /* Total size, including strings */ __u32 mnt_opts; __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; __u64 sb_magic; /* ..._SUPER_MAGIC */ __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ __u32 fs_type; /* [str] Filesystem type */ __u64 mnt_id; /* Unique ID of mount */ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ __u32 mnt_parent_id_old; __u64 mnt_attr; /* MOUNT_ATTR_... 
*/ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ __u64 mnt_peer_group; /* ID of shared peer group */ __u64 mnt_master; /* Mount receives propagation from this ID */ __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; struct mnt_id_req { __u32 size; __u32 spare; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; }; #define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ #define LSMT_ROOT 0xffffffffffffffff /* root mount */ static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, struct statmount *stmnt, size_t bufsize, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER1, .mnt_id = mnt_id, .param = mask, .mnt_ns_id = mnt_ns_id, }; return syscall(__NR_statmount, &req, stmnt, bufsize, flags); } static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, unsigned int flags) { size_t bufsize = 1 << 15; struct statmount *stmnt = NULL, *tmp = NULL; int ret; for (;;) { tmp = realloc(stmnt, bufsize); if (!tmp) goto out; stmnt = tmp; ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags); if (!ret) return stmnt; if (errno != EOVERFLOW) goto out; bufsize <<= 1; if (bufsize >= UINT_MAX / 2) goto out; } out: free(stmnt); printf("statmount failed"); return NULL; } static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id, __u64 list[], size_t num, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER1, .mnt_id = mnt_id, .param = last_mnt_id, .mnt_ns_id = mnt_ns_id, }; return syscall(__NR_listmount, &req, list, num, flags); } int main(int argc, char *argv[]) { #define LISTMNT_BUFFER 10 __u64 list[LISTMNT_BUFFER], last_mnt_id = 0; int ret, pidfd, fd_mntns; struct mnt_ns_info info = {}; pidfd = pidfd_open(getpid(), 
0); if (pidfd < 0) die_errno("pidfd_open failed"); fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0); if (fd_mntns < 0) die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed"); ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info); if (ret < 0) die_errno("ioctl(NS_GET_MNTNS_ID) failed"); printf("Listing %u mounts for mount namespace %d:%llu\n", info.nr_mounts, fd_mntns, info.mnt_ns_id); for (;;) { ssize_t nr_mounts; next: nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id, info.mnt_ns_id, list, LISTMNT_BUFFER, 0); if (nr_mounts <= 0) { printf("Finished listing mounts for mount namespace %d:%llu\n\n", fd_mntns, info.mnt_ns_id); ret = ioctl(fd_mntns, NS_MNT_GET_NEXT, 0); if (ret < 0) die_errno("ioctl(NS_MNT_GET_NEXT) failed"); close(ret); ret = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info); if (ret < 0) { if (errno == ENOENT) { printf("Finished listing all mount namespaces\n"); exit(0); } die_errno("ioctl(NS_MNT_GET_NEXT) failed"); } close(fd_mntns); fd_mntns = ret; last_mnt_id = 0; printf("Listing %u mounts for mount namespace %d:%llu\n", info.nr_mounts, fd_mntns, info.mnt_ns_id); goto next; } for (size_t cur = 0; cur < nr_mounts; cur++) { struct statmount *stmnt; last_mnt_id = list[cur]; stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id, STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID | STATMOUNT_MNT_OPTS | STATMOUNT_FS_TYPE, 0); if (!stmnt) { printf("Failed to statmount(%llu) in mount namespace(%llu)\n", last_mnt_id, info.mnt_ns_id); continue; } printf("mnt_id(%u/%llu) | mnt_parent_id(%u/%llu): %s @ %s ==> %s with options: %s\n", stmnt->mnt_id_old, stmnt->mnt_id, stmnt->mnt_parent_id_old, stmnt->mnt_parent_id, stmnt->str + stmnt->fs_type, stmnt->str + stmnt->mnt_root, stmnt->str + stmnt->mnt_point, stmnt->str + stmnt->mnt_opts); free(stmnt); } } exit(0); } * patches from https://lore.kernel.org/r/20240719-work-mount-namespace-v1-0-834113cab0d2@kernel.org: nsfs: iterate through mount namespaces file: add fput() cleanup helper 
fs: add put_mnt_ns() cleanup helper fs: allow mount namespace fd Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 8400291 + a1d220d commit 49224a3

File tree

6 files changed

+198
-13
lines changed

6 files changed

+198
-13
lines changed

fs/mount.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,16 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
155155

156156
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
157157
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
158+
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
159+
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
160+
{
161+
return __lookup_next_mnt_ns(mntns, false);
162+
}
163+
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
164+
{
165+
return __lookup_next_mnt_ns(mntns, true);
166+
}
167+
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
168+
{
169+
return container_of(ns, struct mnt_namespace, ns);
170+
}

fs/namespace.c

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2060,14 +2060,41 @@ static bool is_mnt_ns_file(struct dentry *dentry)
20602060
dentry->d_fsdata == &mntns_operations;
20612061
}
20622062

2063-
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
2063+
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
20642064
{
2065-
return container_of(ns, struct mnt_namespace, ns);
2065+
return &mnt->ns;
20662066
}
20672067

2068-
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
2068+
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
20692069
{
2070-
return &mnt->ns;
2070+
guard(read_lock)(&mnt_ns_tree_lock);
2071+
for (;;) {
2072+
struct rb_node *node;
2073+
2074+
if (previous)
2075+
node = rb_prev(&mntns->mnt_ns_tree_node);
2076+
else
2077+
node = rb_next(&mntns->mnt_ns_tree_node);
2078+
if (!node)
2079+
return ERR_PTR(-ENOENT);
2080+
2081+
mntns = node_to_mnt_ns(node);
2082+
node = &mntns->mnt_ns_tree_node;
2083+
2084+
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
2085+
continue;
2086+
2087+
/*
2088+
* Holding mnt_ns_tree_lock prevents the mount namespace from
2089+
* being freed but it may well be on its deathbed. We want an
2090+
* active reference, not just a passive one here as we're
2091+
* persisting the mount namespace.
2092+
*/
2093+
if (!refcount_inc_not_zero(&mntns->ns.count))
2094+
continue;
2095+
2096+
return mntns;
2097+
}
20712098
}
20722099

20732100
static bool mnt_ns_loop(struct dentry *dentry)
@@ -5243,12 +5270,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
52435270
* that, or if not simply grab a passive reference on our mount namespace and
52445271
* return that.
52455272
*/
5246-
static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
5273+
static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
52475274
{
5248-
if (mnt_ns_id)
5249-
return lookup_mnt_ns(mnt_ns_id);
5250-
refcount_inc(&current->nsproxy->mnt_ns->passive);
5251-
return current->nsproxy->mnt_ns;
5275+
struct mnt_namespace *mnt_ns;
5276+
5277+
if (kreq->mnt_ns_id && kreq->spare)
5278+
return ERR_PTR(-EINVAL);
5279+
5280+
if (kreq->mnt_ns_id)
5281+
return lookup_mnt_ns(kreq->mnt_ns_id);
5282+
5283+
if (kreq->spare) {
5284+
struct ns_common *ns;
5285+
5286+
CLASS(fd, f)(kreq->spare);
5287+
if (!f.file)
5288+
return ERR_PTR(-EBADF);
5289+
5290+
if (!proc_ns_file(f.file))
5291+
return ERR_PTR(-EINVAL);
5292+
5293+
ns = get_proc_ns(file_inode(f.file));
5294+
if (ns->ops->type != CLONE_NEWNS)
5295+
return ERR_PTR(-EINVAL);
5296+
5297+
mnt_ns = to_mnt_ns(ns);
5298+
} else {
5299+
mnt_ns = current->nsproxy->mnt_ns;
5300+
}
5301+
5302+
refcount_inc(&mnt_ns->passive);
5303+
return mnt_ns;
52525304
}
52535305

52545306
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
@@ -5269,7 +5321,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
52695321
if (ret)
52705322
return ret;
52715323

5272-
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
5324+
ns = grab_requested_mnt_ns(&kreq);
52735325
if (!ns)
52745326
return -ENOENT;
52755327

@@ -5396,7 +5448,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
53965448
if (!kmnt_ids)
53975449
return -ENOMEM;
53985450

5399-
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
5451+
ns = grab_requested_mnt_ns(&kreq);
54005452
if (!ns)
54015453
return -ENOENT;
54025454

fs/nsfs.c

Lines changed: 100 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <linux/user_namespace.h>
1313
#include <linux/nsfs.h>
1414
#include <linux/uaccess.h>
15+
#include <linux/mnt_namespace.h>
1516

1617
#include "mount.h"
1718
#include "internal.h"
@@ -128,13 +129,39 @@ int open_related_ns(struct ns_common *ns,
128129
}
129130
EXPORT_SYMBOL_GPL(open_related_ns);
130131

132+
static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
133+
struct mnt_ns_info __user *uinfo, size_t usize,
134+
struct mnt_ns_info *kinfo)
135+
{
136+
/*
137+
* If userspace and the kernel have the same struct size it can just
138+
* be copied. If userspace provides an older struct, only the bits that
139+
* userspace knows about will be copied. If userspace provides a new
140+
* struct, only the bits that the kernel knows about will be copied and
141+
* the size value will be set to the size the kernel knows about.
142+
*/
143+
kinfo->size = min(usize, sizeof(*kinfo));
144+
kinfo->mnt_ns_id = mnt_ns->seq;
145+
kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
146+
/* Subtract the root mount of the mount namespace. */
147+
if (kinfo->nr_mounts)
148+
kinfo->nr_mounts--;
149+
150+
if (copy_to_user(uinfo, kinfo, kinfo->size))
151+
return -EFAULT;
152+
153+
return 0;
154+
}
155+
131156
static long ns_ioctl(struct file *filp, unsigned int ioctl,
132157
unsigned long arg)
133158
{
134159
struct user_namespace *user_ns;
135160
struct pid_namespace *pid_ns;
136161
struct task_struct *tsk;
137162
struct ns_common *ns = get_proc_ns(file_inode(filp));
163+
struct mnt_namespace *mnt_ns;
164+
bool previous = false;
138165
uid_t __user *argp;
139166
uid_t uid;
140167
int ret;
@@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
156183
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
157184
return put_user(uid, argp);
158185
case NS_GET_MNTNS_ID: {
159-
struct mnt_namespace *mnt_ns;
160186
__u64 __user *idp;
161187
__u64 id;
162188

@@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
211237

212238
if (!ret)
213239
ret = -ESRCH;
214-
break;
240+
return ret;
241+
}
242+
}
243+
244+
/* extensible ioctls */
245+
switch (_IOC_NR(ioctl)) {
246+
case _IOC_NR(NS_MNT_GET_INFO): {
247+
struct mnt_ns_info kinfo = {};
248+
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
249+
size_t usize = _IOC_SIZE(ioctl);
250+
251+
if (ns->ops->type != CLONE_NEWNS)
252+
return -EINVAL;
253+
254+
if (!uinfo)
255+
return -EINVAL;
256+
257+
if (usize < MNT_NS_INFO_SIZE_VER0)
258+
return -EINVAL;
259+
260+
return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
261+
}
262+
case _IOC_NR(NS_MNT_GET_PREV):
263+
previous = true;
264+
fallthrough;
265+
case _IOC_NR(NS_MNT_GET_NEXT): {
266+
struct mnt_ns_info kinfo = {};
267+
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
268+
struct path path __free(path_put) = {};
269+
struct file *f __free(fput) = NULL;
270+
size_t usize = _IOC_SIZE(ioctl);
271+
272+
if (ns->ops->type != CLONE_NEWNS)
273+
return -EINVAL;
274+
275+
if (usize < MNT_NS_INFO_SIZE_VER0)
276+
return -EINVAL;
277+
278+
if (previous)
279+
mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
280+
else
281+
mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
282+
if (IS_ERR(mnt_ns))
283+
return PTR_ERR(mnt_ns);
284+
285+
ns = to_ns_common(mnt_ns);
286+
/* Transfer ownership of @mnt_ns reference to @path. */
287+
ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
288+
if (ret)
289+
return ret;
290+
291+
CLASS(get_unused_fd, fd)(O_CLOEXEC);
292+
if (fd < 0)
293+
return fd;
294+
295+
f = dentry_open(&path, O_RDONLY, current_cred());
296+
if (IS_ERR(f))
297+
return PTR_ERR(f);
298+
299+
if (uinfo) {
300+
/*
301+
* If @uinfo is passed return all information about the
302+
* mount namespace as well.
303+
*/
304+
ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
305+
if (ret)
306+
return ret;
307+
}
308+
309+
/* Transfer reference of @f to caller's fdtable. */
310+
fd_install(fd, no_free_ptr(f));
311+
/* File descriptor is live so hand it off to the caller. */
312+
return take_fd(fd);
215313
}
216314
default:
217315
ret = -ENOTTY;

include/linux/file.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <linux/posix_types.h>
1212
#include <linux/errno.h>
1313
#include <linux/cleanup.h>
14+
#include <linux/err.h>
1415

1516
struct file;
1617

@@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd);
9697

9798
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
9899
get_unused_fd_flags(flags), unsigned flags)
100+
DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
99101

100102
/*
101103
* take_fd() will take care to set @fd to -EBADF ensuring that

include/linux/mnt_namespace.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
#define _NAMESPACE_H_
44
#ifdef __KERNEL__
55

6+
#include <linux/cleanup.h>
7+
#include <linux/err.h>
8+
69
struct mnt_namespace;
710
struct fs_struct;
811
struct user_namespace;
@@ -11,6 +14,7 @@ struct ns_common;
1114
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
1215
struct user_namespace *, struct fs_struct *);
1316
extern void put_mnt_ns(struct mnt_namespace *ns);
17+
DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T))
1418
extern struct ns_common *from_mnt_ns(struct mnt_namespace *);
1519

1620
extern const struct file_operations proc_mounts_operations;

include/uapi/linux/nsfs.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#define __LINUX_NSFS_H
44

55
#include <linux/ioctl.h>
6+
#include <linux/types.h>
67

78
#define NSIO 0xb7
89

@@ -26,4 +27,19 @@
2627
/* Return thread-group leader id of pid in the target pid namespace. */
2728
#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int)
2829

30+
struct mnt_ns_info {
31+
__u32 size;
32+
__u32 nr_mounts;
33+
__u64 mnt_ns_id;
34+
};
35+
36+
#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
37+
38+
/* Get information about namespace. */
39+
#define NS_MNT_GET_INFO _IOR(NSIO, 10, struct mnt_ns_info)
40+
/* Get next namespace. */
41+
#define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info)
42+
/* Get previous namespace. */
43+
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
44+
2945
#endif /* __LINUX_NSFS_H */

0 commit comments

Comments
 (0)