diff --git a/bubblewrap.c b/bubblewrap.c index ed94923f..5b1f8522 100644 --- a/bubblewrap.c +++ b/bubblewrap.c @@ -71,10 +71,14 @@ uid_t opt_sandbox_uid = -1; gid_t opt_sandbox_gid = -1; int opt_sync_fd = -1; int opt_block_fd = -1; +int opt_userns_block_fd = -1; int opt_info_fd = -1; int opt_seccomp_fd = -1; char *opt_sandbox_hostname = NULL; +#define CAP_TO_MASK_0(x) (1L << ((x) & 31)) +#define CAP_TO_MASK_1(x) CAP_TO_MASK_0(x - 32) + typedef enum { SETUP_BIND_MOUNT, SETUP_RO_BIND_MOUNT, @@ -217,10 +221,13 @@ usage (int ecode, FILE *out) " --symlink SRC DEST Create symlink at DEST with target SRC\n" " --seccomp FD Load and use seccomp rules from FD\n" " --block-fd FD Block on FD until some data to read is available\n" + " --userns-block-fd FD Block on FD until the user namespace is ready\n" " --info-fd FD Write information about the running container to FD\n" " --new-session Create a new terminal session\n" " --die-with-parent Kills with SIGKILL child process (COMMAND) when bwrap or bwrap's parent dies.\n" " --as-pid-1 Do not install a reaper process with PID=1\n" + " --cap-add CAP Add cap CAP when running as privileged user\n" + " --cap-drop CAP Drop cap CAP when running as privileged user\n" ); exit (ecode); } @@ -450,8 +457,13 @@ do_init (int event_fd, pid_t initial_pid, struct sock_fprog *seccomp_prog) return initial_exit_status; } +#define CAP_TO_MASK_0(x) (1L << ((x) & 31)) +#define CAP_TO_MASK_1(x) CAP_TO_MASK_0(x - 32) + +static uint32_t requested_caps[2] = {0, 0}; + /* low 32bit caps needed */ -#define REQUIRED_CAPS_0 (CAP_TO_MASK (CAP_SYS_ADMIN) | CAP_TO_MASK (CAP_SYS_CHROOT) | CAP_TO_MASK (CAP_NET_ADMIN) | CAP_TO_MASK (CAP_SETUID) | CAP_TO_MASK (CAP_SETGID)) +#define REQUIRED_CAPS_0 (CAP_TO_MASK_0 (CAP_SYS_ADMIN) | CAP_TO_MASK_0 (CAP_SYS_CHROOT) | CAP_TO_MASK_0 (CAP_NET_ADMIN) | CAP_TO_MASK_0 (CAP_SETUID) | CAP_TO_MASK_0 (CAP_SETGID)) /* high 32bit caps needed */ #define REQUIRED_CAPS_1 0 @@ -473,11 +485,21 @@ set_required_caps (void) } static void 
-drop_all_caps (void) +drop_all_caps (bool keep_requested_caps) { struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 }; struct __user_cap_data_struct data[2] = { { 0 } }; + if (keep_requested_caps) + { + data[0].effective = requested_caps[0]; + data[0].permitted = requested_caps[0]; + data[0].inheritable = requested_caps[0]; + data[1].effective = requested_caps[1]; + data[1].permitted = requested_caps[1]; + data[1].inheritable = requested_caps[1]; + } + if (capset (&hdr, data) < 0) die_with_error ("capset failed"); } @@ -494,8 +516,12 @@ has_caps (void) return data[0].permitted != 0 || data[1].permitted != 0; } +/* Most of the code here is used both to add caps to the ambient capabilities + * and drop caps from the bounding set. Handle both cases here and add + * drop_cap_bounding_set/set_ambient_capabilities wrappers to facilitate its usage. + */ static void -drop_cap_bounding_set (void) +prctl_caps (uint32_t *caps, bool do_cap_bounding, bool do_set_ambient) { unsigned long cap; @@ -506,14 +532,56 @@ drop_cap_bounding_set (void) * https://github.com/projectatomic/bubblewrap/pull/175#issuecomment-278051373 * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=160da84dbb39443fdade7151bc63a88f8e953077 */ - for (cap = 0; cap <= 63; cap++) + for (cap = 0; cap <= CAP_LAST_CAP; cap++) + { + bool keep = FALSE; + if (cap < 32) + { + if (CAP_TO_MASK_0 (cap) & caps[0]) + keep = TRUE; + } + else + { + if (CAP_TO_MASK_1 (cap) & caps[1]) + keep = TRUE; + } + + if (keep && do_set_ambient) + { + int res = prctl (PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0); + if (res == -1 && !(errno == EINVAL || errno == EPERM)) + die_with_error ("Adding ambient capability %ld", cap); + } + + if (!keep && do_cap_bounding) + { + int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0); + if (res == -1 && !(errno == EINVAL || errno == EPERM)) + die_with_error ("Dropping capability %ld from bounds", cap); + } + } +} + +static void 
+drop_cap_bounding_set (bool drop_all) +{ + if (!drop_all) + prctl_caps (requested_caps, TRUE, FALSE); + else { - int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0); - if (res == -1 && !(errno == EINVAL || errno == EPERM)) - die_with_error ("Dropping capability %ld from bounds", cap); + uint32_t no_caps[2] = {0, 0}; + prctl_caps (no_caps, TRUE, FALSE); } } +static void +set_ambient_capabilities (void) +{ + if (is_privileged) + return; + prctl_caps (requested_caps, FALSE, TRUE); +} + /* This acquires the privileges that the bwrap will need it to work. * If bwrap is not setuid, then this does nothing, and it relies on * unprivileged user namespaces to be used. This case is @@ -562,8 +630,8 @@ acquire_privs (void) if (new_fsuid != real_uid) die ("Unable to set fsuid (was %d)", (int)new_fsuid); - /* We never need capabilies after execve(), so lets drop everything from the bounding set */ - drop_cap_bounding_set (); + /* We never need capabilities after execve(), so lets drop everything from the bounding set */ + drop_cap_bounding_set (TRUE); /* Keep only the required capabilities for setup */ set_required_caps (); @@ -585,7 +653,7 @@ switch_to_user_with_privs (void) { /* If we're in a new user namespace, we got back the bounding set, clear it again */ if (opt_unshare_user) - drop_cap_bounding_set (); + drop_cap_bounding_set (FALSE); if (!is_privileged) return; @@ -602,16 +670,13 @@ switch_to_user_with_privs (void) } static void -drop_privs (void) +drop_privs (bool keep_requested_caps) { - if (!is_privileged) - return; - /* Drop root uid */ - if (setuid (opt_sandbox_uid) < 0) + if (getuid () == 0 && setuid (opt_sandbox_uid) < 0) die_with_error ("unable to drop root uid"); - drop_all_caps (); + drop_all_caps (keep_requested_caps); } static char * @@ -1541,6 +1606,23 @@ parse_args_recurse (int *argcp, opt_block_fd = the_fd; + argv += 1; + argc -= 1; + } + else if (strcmp (arg, "--userns-block-fd") == 0) + { + int the_fd; + char *endptr; + + if (argc < 2) + die 
("--userns-block-fd takes an argument"); + + the_fd = strtol (argv[1], &endptr, 10); + if (argv[1][0] == 0 || endptr[0] != 0 || the_fd < 0) + die ("Invalid fd: %s", argv[1]); + + opt_userns_block_fd = the_fd; + argv += 1; argc -= 1; } @@ -1658,6 +1740,54 @@ parse_args_recurse (int *argcp, { opt_as_pid_1 = TRUE; } + else if (strcmp (arg, "--cap-add") == 0) + { + cap_value_t cap; + if (argc < 2) + die ("--cap-add takes an argument"); + + if (strcasecmp (argv[1], "ALL") == 0) + { + requested_caps[0] = requested_caps[1] = 0xFFFFFFFF; + } + else + { + if (cap_from_name (argv[1], &cap) < 0) + die ("unknown cap: %s", argv[1]); + + if (cap < 32) + requested_caps[0] |= CAP_TO_MASK_0 (cap); + else + requested_caps[1] |= CAP_TO_MASK_1 (cap - 32); + } + + argv += 1; + argc -= 1; + } + else if (strcmp (arg, "--cap-drop") == 0) + { + cap_value_t cap; + if (argc < 2) + die ("--cap-drop takes an argument"); + + if (strcasecmp (argv[1], "ALL") == 0) + { + requested_caps[0] = requested_caps[1] = 0; + } + else + { + if (cap_from_name (argv[1], &cap) < 0) + die ("unknown cap: %s", argv[1]); + + if (cap < 32) + requested_caps[0] &= ~CAP_TO_MASK_0 (cap); + else + requested_caps[1] &= ~CAP_TO_MASK_1 (cap - 32); + } + + argv += 1; + argc -= 1; + } else if (*arg == '-') { die ("Unknown option %s", arg); @@ -1764,6 +1894,15 @@ main (int argc, parse_args (&argc, &argv); + if ((requested_caps[0] || requested_caps[1]) && is_privileged) + die ("--cap-add in setuid mode can be used only by root"); + + if (opt_userns_block_fd != -1 && !opt_unshare_user) + die ("--userns-block-fd requires --unshare-user"); + + if (opt_userns_block_fd != -1 && opt_info_fd == -1) + die ("--userns-block-fd requires --info-fd"); + /* We have to do this if we weren't installed setuid (and we're not * root), so let's just DWIM */ if (!is_privileged && getuid () != 0) @@ -1902,7 +2041,7 @@ main (int argc, { /* Parent, outside sandbox, privileged (initially) */ - if (is_privileged && opt_unshare_user) + if (is_privileged 
&& opt_unshare_user && opt_userns_block_fd == -1) { /* We're running as euid 0, but the uid we want to map is * not 0. This means we're not allowed to write this from @@ -1920,17 +2059,11 @@ main (int argc, /* Initial launched process, wait for exec:ed command to exit */ /* We don't need any privileges in the launcher, drop them immediately. */ - drop_privs (); + drop_privs (FALSE); /* Optionally bind our lifecycle to that of the parent */ handle_die_with_parent (); - /* Let child run now that the uid maps are set up */ - val = 1; - res = write (child_wait_fd, &val, 8); - /* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */ - close (child_wait_fd); - if (opt_info_fd != -1) { cleanup_free char *output = xasprintf ("{\n \"child-pid\": %i\n}\n", pid); @@ -1940,6 +2073,19 @@ main (int argc, close (opt_info_fd); } + if (opt_userns_block_fd != -1) + { + char b[1]; + (void) TEMP_FAILURE_RETRY (read (opt_userns_block_fd, b, 1)); + close (opt_userns_block_fd); + } + + /* Let child run now that the uid maps are set up */ + val = 1; + res = write (child_wait_fd, &val, 8); + /* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */ + close (child_wait_fd); + monitor_child (event_fd, pid); exit (0); /* Should not be reached, but better safe... 
*/ } @@ -1976,7 +2122,7 @@ main (int argc, ns_uid = opt_sandbox_uid; ns_gid = opt_sandbox_gid; - if (!is_privileged && opt_unshare_user) + if (!is_privileged && opt_unshare_user && opt_userns_block_fd == -1) { /* In the unprivileged case we have to write the uid/gid maps in * the child, because we have no caps in the parent */ @@ -2050,7 +2196,7 @@ main (int argc, if (child == 0) { /* Unprivileged setup process */ - drop_privs (); + drop_privs (FALSE); close (privsep_sockets[0]); setup_newroot (opt_unshare_pid, privsep_sockets[1]); exit (0); @@ -2093,7 +2239,8 @@ main (int argc, die_with_error ("unmount old root"); if (opt_unshare_user && - (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid)) + (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid) && + opt_userns_block_fd == -1) { /* Now that devpts is mounted and we've no need for mount permissions we can create a new userspace and map our uid @@ -2115,8 +2262,8 @@ main (int argc, if (chdir ("/") != 0) die_with_error ("chdir /"); - /* All privileged ops are done now, so drop it */ - drop_privs (); + /* All privileged ops are done now, so drop caps we don't need */ + drop_privs (!is_privileged); if (opt_block_fd != -1) { @@ -2188,6 +2335,8 @@ main (int argc, if (pid != 0) { + drop_all_caps (FALSE); + /* Close fds in pid 1, except stdio and optionally event_fd (for syncing pid 2 lifetime with monitor_child) and opt_sync_fd (for syncing sandbox lifetime with outside @@ -2227,6 +2376,9 @@ main (int argc, /* Optionally bind our lifecycle */ handle_die_with_parent (); + if (!is_privileged) + set_ambient_capabilities (); + /* Should be the last thing before execve() so that filters don't * need to handle anything above */ if (seccomp_data != NULL && diff --git a/bwrap.xml b/bwrap.xml index e16d2598..67bdc50e 100644 --- a/bwrap.xml +++ b/bwrap.xml @@ -262,6 +262,14 @@ Block the sandbox on reading from FD until some data is available. + + + + Do not initialize the user namespace but wait on FD until it is ready. 
This allows + external processes (like newuidmap/newgidmap) to set up the user namespace before it + is used by the sandbox process. + + @@ -295,6 +303,25 @@ Do not create a process with PID=1 in the sandbox to reap child processes. + + + + Add the specified capability when running as privileged user. It accepts + the special value ALL to add all the permitted caps. + + + + + + Drop the specified capability when running as privileged user. It accepts + the special value ALL to drop all the caps. + + By default no caps are left in the sandboxed process. The + and + options are processed in the order they are specified on the + command line. Pay attention to the order in which they are specified. + + diff --git a/completions/bash/bwrap b/completions/bash/bwrap index 63781649..d045bcb5 100644 --- a/completions/bash/bwrap +++ b/completions/bash/bwrap @@ -50,6 +50,9 @@ _bwrap() { --seccomp --symlink --die-with-parent + --cap-add + --cap-drop + --userns-block-fd " if [[ "$cur" == -* ]]; then diff --git a/configure.ac b/configure.ac index 2203b22c..b96b5e2e 100644 --- a/configure.ac +++ b/configure.ac @@ -87,6 +87,12 @@ CC_CHECK_FLAGS_APPEND([WARN_CFLAGS], [CFLAGS], [\ ]) AC_SUBST(WARN_CFLAGS) +AC_CHECK_LIB(cap, cap_from_text) + +if test "$ac_cv_lib_cap_cap_from_text" != "yes"; then + AC_MSG_ERROR([*** libcap requested but not found]) +fi + AC_ARG_WITH(priv-mode, AS_HELP_STRING([--with-priv-mode=setuid/none], [How to set privilege-raising during make install]), diff --git a/demos/userns-block-fd.py b/demos/userns-block-fd.py new file mode 100755 index 00000000..0677a0d0 --- /dev/null +++ b/demos/userns-block-fd.py @@ -0,0 +1,36 @@ +#!/bin/python + +import os, select, subprocess, json + +pipe_info = os.pipe() +userns_block = os.pipe() + +pid = os.fork() + +if pid != 0: + os.close(pipe_info[1]) + os.close(userns_block[0]) + + select.select([pipe_info[0]], [], []) + + data = json.load(os.fdopen(pipe_info[0])) + child_pid = str(data['child-pid']) + + subprocess.call(["newuidmap", 
child_pid, "0", str(os.getuid()), "1"]) + subprocess.call(["newgidmap", child_pid, "0", str(os.getgid()), "1"]) + + os.write(userns_block[1], '1') +else: + os.close(pipe_info[0]) + os.close(userns_block[1]) + + args = ["bwrap", + "bwrap", + "--unshare-all", + "--unshare-user", + "--userns-block-fd", "%i" % userns_block[0], + "--info-fd", "%i" % pipe_info[1], + "--bind", "/", "/", + "cat", "/proc/self/uid_map"] + + os.execl(*args) diff --git a/tests/test-run.sh b/tests/test-run.sh index e79d51ca..43169d79 100755 --- a/tests/test-run.sh +++ b/tests/test-run.sh @@ -70,7 +70,15 @@ for ALT in "" "--unshare-user-try" "--unshare-pid" "--unshare-user-try --unshar $RUN $ALT --unshare-net --proc /proc --dev /dev true # Unreadable file echo -n "expect EPERM: " - if $RUN $ALT --unshare-net --proc /proc --bind /etc/shadow /tmp/foo cat /etc/shadow; then + + # Test caps when bwrap is not setuid + if ! test -u ${BWRAP}; then + CAP="--cap-add ALL" + else + CAP="" + fi + + if $RUN $CAP $ALT --unshare-net --proc /proc --bind /etc/shadow /tmp/foo cat /etc/shadow; then assert_not_reached Could read /etc/shadow fi # Unreadable dir @@ -89,6 +97,12 @@ done $RUN --unshare-pid --as-pid-1 --bind / / bash -c 'echo $$' > as_pid_1.txt assert_file_has_content as_pid_1.txt "1" +# Check that by default we have no caps left +for OPT in "" "--unshare-user-try --as-pid-1" "--unshare-user-try" "--as-pid-1"; do + $RUN $OPT --unshare-pid getpcaps 1 2> /tmp/caps + grep -q ": =$" /tmp/caps +done + # Test --die-with-parent cat >lockf-n.py <