diff --git a/bubblewrap.c b/bubblewrap.c
index ed94923f..5b1f8522 100644
--- a/bubblewrap.c
+++ b/bubblewrap.c
@@ -71,10 +71,14 @@ uid_t opt_sandbox_uid = -1;
gid_t opt_sandbox_gid = -1;
int opt_sync_fd = -1;
int opt_block_fd = -1;
+int opt_userns_block_fd = -1;
int opt_info_fd = -1;
int opt_seccomp_fd = -1;
char *opt_sandbox_hostname = NULL;
+#define CAP_TO_MASK_0(x) (1L << ((x) & 31))
+#define CAP_TO_MASK_1(x) CAP_TO_MASK_0(x - 32)
+
typedef enum {
SETUP_BIND_MOUNT,
SETUP_RO_BIND_MOUNT,
@@ -217,10 +221,13 @@ usage (int ecode, FILE *out)
" --symlink SRC DEST Create symlink at DEST with target SRC\n"
" --seccomp FD Load and use seccomp rules from FD\n"
" --block-fd FD Block on FD until some data to read is available\n"
+ " --userns-block-fd FD Block on FD until the user namespace is ready\n"
" --info-fd FD Write information about the running container to FD\n"
" --new-session Create a new terminal session\n"
" --die-with-parent Kills with SIGKILL child process (COMMAND) when bwrap or bwrap's parent dies.\n"
" --as-pid-1 Do not install a reaper process with PID=1\n"
+ " --cap-add CAP Add cap CAP when running as privileged user\n"
+ " --cap-drop CAP Drop cap CAP when running as privileged user\n"
);
exit (ecode);
}
@@ -450,8 +457,13 @@ do_init (int event_fd, pid_t initial_pid, struct sock_fprog *seccomp_prog)
return initial_exit_status;
}
+/* Caps selected with --cap-add/--cap-drop, as two 32-bit words: [0] holds
+ * caps 0-31 and [1] holds caps 32-63. The CAP_TO_MASK_0/CAP_TO_MASK_1
+ * helpers used to build them are defined once, next to the option globals. */
+static uint32_t requested_caps[2] = {0, 0};
+
/* low 32bit caps needed */
-#define REQUIRED_CAPS_0 (CAP_TO_MASK (CAP_SYS_ADMIN) | CAP_TO_MASK (CAP_SYS_CHROOT) | CAP_TO_MASK (CAP_NET_ADMIN) | CAP_TO_MASK (CAP_SETUID) | CAP_TO_MASK (CAP_SETGID))
+#define REQUIRED_CAPS_0 (CAP_TO_MASK_0 (CAP_SYS_ADMIN) | CAP_TO_MASK_0 (CAP_SYS_CHROOT) | CAP_TO_MASK_0 (CAP_NET_ADMIN) | CAP_TO_MASK_0 (CAP_SETUID) | CAP_TO_MASK_0 (CAP_SETGID))
/* high 32bit caps needed */
#define REQUIRED_CAPS_1 0
@@ -473,11 +485,21 @@ set_required_caps (void)
}
static void
-drop_all_caps (void)
+drop_all_caps (bool keep_requested_caps)
{
struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 };
struct __user_cap_data_struct data[2] = { { 0 } };
+  if (keep_requested_caps)
+    {
+      int word;
+
+      /* Each 32-bit word keeps exactly the caps requested for it */
+      for (word = 0; word < 2; word++)
+        data[word].effective = data[word].permitted =
+          data[word].inheritable = requested_caps[word];
+    }
+
if (capset (&hdr, data) < 0)
die_with_error ("capset failed");
}
@@ -494,8 +516,12 @@ has_caps (void)
return data[0].permitted != 0 || data[1].permitted != 0;
}
+/* Walk cap numbers 0..63 (not just the build-time CAP_LAST_CAP, so caps of
+ * newer kernels are covered): caps set in CAPS are raised as ambient when
+ * DO_SET_AMBIENT, all others leave the bounding set when DO_CAP_BOUNDING.
+ */
static void
-drop_cap_bounding_set (void)
+prctl_caps (uint32_t *caps, bool do_cap_bounding, bool do_set_ambient)
{
unsigned long cap;
@@ -506,14 +532,56 @@ drop_cap_bounding_set (void)
* https://github.com/projectatomic/bubblewrap/pull/175#issuecomment-278051373
* https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=160da84dbb39443fdade7151bc63a88f8e953077
*/
   for (cap = 0; cap <= 63; cap++)
+    {
+      bool keep = FALSE;
+      if (cap < 32)
+        {
+          if (CAP_TO_MASK_0 (cap) & caps[0])
+            keep = TRUE;
+        }
+      else
+        {
+          if (CAP_TO_MASK_1 (cap) & caps[1])
+            keep = TRUE;
+        }
+
+      if (keep && do_set_ambient)
+        {
+          int res = prctl (PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0);
+          if (res == -1 && !(errno == EINVAL || errno == EPERM))
+            die_with_error ("Adding ambient capability %lu", cap);
+        }
+
+      if (!keep && do_cap_bounding)
+        {
+          int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0);
+          if (res == -1 && !(errno == EINVAL || errno == EPERM))
+            die_with_error ("Dropping capability %lu from bounds", cap);
+        }
+    }
+}
+
+static void
+drop_cap_bounding_set (bool drop_all)
+{
+ if (!drop_all)
+ prctl_caps (requested_caps, TRUE, FALSE);
+ else
{
- int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0);
- if (res == -1 && !(errno == EINVAL || errno == EPERM))
- die_with_error ("Dropping capability %ld from bounds", cap);
+ uint32_t no_caps[2] = {0, 0};
+ prctl_caps (no_caps, TRUE, FALSE);
}
}
+/* Raise the requested caps as ambient; skipped when is_privileged is set */
+static void
+set_ambient_capabilities (void)
+{
+  if (!is_privileged)
+    prctl_caps (requested_caps, FALSE, TRUE);
+}
+
/* This acquires the privileges that the bwrap will need it to work.
* If bwrap is not setuid, then this does nothing, and it relies on
* unprivileged user namespaces to be used. This case is
@@ -562,8 +630,8 @@ acquire_privs (void)
if (new_fsuid != real_uid)
die ("Unable to set fsuid (was %d)", (int)new_fsuid);
- /* We never need capabilies after execve(), so lets drop everything from the bounding set */
- drop_cap_bounding_set ();
+ /* We never need capabilities after execve(), so lets drop everything from the bounding set */
+ drop_cap_bounding_set (TRUE);
/* Keep only the required capabilities for setup */
set_required_caps ();
@@ -585,7 +653,7 @@ switch_to_user_with_privs (void)
{
/* If we're in a new user namespace, we got back the bounding set, clear it again */
if (opt_unshare_user)
- drop_cap_bounding_set ();
+ drop_cap_bounding_set (FALSE);
if (!is_privileged)
return;
@@ -602,16 +670,13 @@ switch_to_user_with_privs (void)
}
static void
-drop_privs (void)
+drop_privs (bool keep_requested_caps)
{
- if (!is_privileged)
- return;
-
-  /* Drop root uid */
- if (setuid (opt_sandbox_uid) < 0)
+  /* Drop root uid; a setuid bwrap has euid 0 but a non-root real uid */
+  if (geteuid () == 0 && setuid (opt_sandbox_uid) < 0)
die_with_error ("unable to drop root uid");
- drop_all_caps ();
+  drop_all_caps (keep_requested_caps);
}
static char *
@@ -1541,6 +1606,23 @@ parse_args_recurse (int *argcp,
opt_block_fd = the_fd;
+ argv += 1;
+ argc -= 1;
+ }
+ else if (strcmp (arg, "--userns-block-fd") == 0)
+ {
+ int the_fd;
+ char *endptr;
+
+ if (argc < 2)
+ die ("--userns-block-fd takes an argument");
+
+ the_fd = strtol (argv[1], &endptr, 10);
+ if (argv[1][0] == 0 || endptr[0] != 0 || the_fd < 0)
+ die ("Invalid fd: %s", argv[1]);
+
+ opt_userns_block_fd = the_fd;
+
argv += 1;
argc -= 1;
}
@@ -1658,6 +1740,54 @@ parse_args_recurse (int *argcp,
{
opt_as_pid_1 = TRUE;
}
+      else if (strcmp (arg, "--cap-add") == 0)
+        {
+          cap_value_t cap;
+          if (argc < 2)
+            die ("--cap-add takes an argument");
+
+          if (strcasecmp (argv[1], "ALL") == 0)
+            {
+              requested_caps[0] = requested_caps[1] = 0xFFFFFFFF;
+            }
+          else
+            {
+              if (cap_from_name (argv[1], &cap) < 0)
+                die ("unknown cap: %s", argv[1]);
+              /* CAP_TO_MASK_1 rebases caps 32..63 itself; pass the raw number */
+              if (cap < 32)
+                requested_caps[0] |= CAP_TO_MASK_0 (cap);
+              else
+                requested_caps[1] |= CAP_TO_MASK_1 (cap);
+            }
+
+          argv += 1;
+          argc -= 1;
+        }
+      else if (strcmp (arg, "--cap-drop") == 0)
+        {
+          cap_value_t cap;
+          if (argc < 2)
+            die ("--cap-drop takes an argument");
+
+          if (strcasecmp (argv[1], "ALL") == 0)
+            {
+              requested_caps[0] = requested_caps[1] = 0;
+            }
+          else
+            {
+              if (cap_from_name (argv[1], &cap) < 0)
+                die ("unknown cap: %s", argv[1]);
+              /* CAP_TO_MASK_1 rebases caps 32..63 itself; pass the raw number */
+              if (cap < 32)
+                requested_caps[0] &= ~CAP_TO_MASK_0 (cap);
+              else
+                requested_caps[1] &= ~CAP_TO_MASK_1 (cap);
+            }
+
+          argv += 1;
+          argc -= 1;
+        }
else if (*arg == '-')
{
die ("Unknown option %s", arg);
@@ -1764,6 +1894,15 @@ main (int argc,
parse_args (&argc, &argv);
+ if ((requested_caps[0] || requested_caps[1]) && is_privileged)
+ die ("--cap-add in setuid mode can be used only by root");
+
+ if (opt_userns_block_fd != -1 && !opt_unshare_user)
+ die ("--userns-block-fd requires --unshare-user");
+
+ if (opt_userns_block_fd != -1 && opt_info_fd == -1)
+ die ("--userns-block-fd requires --info-fd");
+
/* We have to do this if we weren't installed setuid (and we're not
* root), so let's just DWIM */
if (!is_privileged && getuid () != 0)
@@ -1902,7 +2041,7 @@ main (int argc,
{
/* Parent, outside sandbox, privileged (initially) */
- if (is_privileged && opt_unshare_user)
+ if (is_privileged && opt_unshare_user && opt_userns_block_fd == -1)
{
/* We're running as euid 0, but the uid we want to map is
* not 0. This means we're not allowed to write this from
@@ -1920,17 +2059,11 @@ main (int argc,
/* Initial launched process, wait for exec:ed command to exit */
/* We don't need any privileges in the launcher, drop them immediately. */
- drop_privs ();
+ drop_privs (FALSE);
/* Optionally bind our lifecycle to that of the parent */
handle_die_with_parent ();
- /* Let child run now that the uid maps are set up */
- val = 1;
- res = write (child_wait_fd, &val, 8);
- /* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */
- close (child_wait_fd);
-
if (opt_info_fd != -1)
{
cleanup_free char *output = xasprintf ("{\n \"child-pid\": %i\n}\n", pid);
@@ -1940,6 +2073,19 @@ main (int argc,
close (opt_info_fd);
}
+ if (opt_userns_block_fd != -1)
+ {
+ char b[1];
+ (void) TEMP_FAILURE_RETRY (read (opt_userns_block_fd, b, 1));
+ close (opt_userns_block_fd);
+ }
+
+ /* Let child run now that the uid maps are set up */
+ val = 1;
+ res = write (child_wait_fd, &val, 8);
+ /* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */
+ close (child_wait_fd);
+
monitor_child (event_fd, pid);
exit (0); /* Should not be reached, but better safe... */
}
@@ -1976,7 +2122,7 @@ main (int argc,
ns_uid = opt_sandbox_uid;
ns_gid = opt_sandbox_gid;
- if (!is_privileged && opt_unshare_user)
+ if (!is_privileged && opt_unshare_user && opt_userns_block_fd == -1)
{
/* In the unprivileged case we have to write the uid/gid maps in
* the child, because we have no caps in the parent */
@@ -2050,7 +2196,7 @@ main (int argc,
if (child == 0)
{
/* Unprivileged setup process */
- drop_privs ();
+ drop_privs (FALSE);
close (privsep_sockets[0]);
setup_newroot (opt_unshare_pid, privsep_sockets[1]);
exit (0);
@@ -2093,7 +2239,8 @@ main (int argc,
die_with_error ("unmount old root");
if (opt_unshare_user &&
- (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid))
+ (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid) &&
+ opt_userns_block_fd == -1)
{
/* Now that devpts is mounted and we've no need for mount
permissions we can create a new userspace and map our uid
@@ -2115,8 +2262,8 @@ main (int argc,
if (chdir ("/") != 0)
die_with_error ("chdir /");
- /* All privileged ops are done now, so drop it */
- drop_privs ();
+ /* All privileged ops are done now, so drop caps we don't need */
+ drop_privs (!is_privileged);
if (opt_block_fd != -1)
{
@@ -2188,6 +2335,8 @@ main (int argc,
if (pid != 0)
{
+ drop_all_caps (FALSE);
+
/* Close fds in pid 1, except stdio and optionally event_fd
(for syncing pid 2 lifetime with monitor_child) and
opt_sync_fd (for syncing sandbox lifetime with outside
@@ -2227,6 +2376,9 @@ main (int argc,
/* Optionally bind our lifecycle */
handle_die_with_parent ();
+ if (!is_privileged)
+ set_ambient_capabilities ();
+
/* Should be the last thing before execve() so that filters don't
* need to handle anything above */
if (seccomp_data != NULL &&
diff --git a/bwrap.xml b/bwrap.xml
index e16d2598..67bdc50e 100644
--- a/bwrap.xml
+++ b/bwrap.xml
@@ -262,6 +262,14 @@
Block the sandbox on reading from FD until some data is available.
+
+
+
+ Do not initialize the user namespace but wait on FD until it is ready. This allows
+ external processes (like newuidmap/newgidmap) to set up the user namespace before it
+ is used by the sandbox process.
+
+
@@ -295,6 +303,25 @@
Do not create a process with PID=1 in the sandbox to reap child processes.
+
+
+
+ Add the specified capability when running as privileged user. It accepts
+ the special value ALL to add all the permitted caps.
+
+
+
+
+
+ Drop the specified capability when running as privileged user. It accepts
+ the special value ALL to drop all the caps.
+
+ By default no caps are left in the sandboxed process. The
+ <option>--cap-add</option> and <option>--cap-drop</option>
+ options are processed in the order they are specified on the
+ command line, so pay attention to the order in which you pass them.
+
+
diff --git a/completions/bash/bwrap b/completions/bash/bwrap
index 63781649..d045bcb5 100644
--- a/completions/bash/bwrap
+++ b/completions/bash/bwrap
@@ -50,6 +50,9 @@ _bwrap() {
--seccomp
--symlink
--die-with-parent
+ --cap-add
+ --cap-drop
+ --userns-block-fd
"
if [[ "$cur" == -* ]]; then
diff --git a/configure.ac b/configure.ac
index 2203b22c..b96b5e2e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -87,6 +87,12 @@ CC_CHECK_FLAGS_APPEND([WARN_CFLAGS], [CFLAGS], [\
])
AC_SUBST(WARN_CFLAGS)
+AC_CHECK_LIB([cap], [cap_from_name])
+
+if test "x$ac_cv_lib_cap_cap_from_name" != "xyes"; then
+ AC_MSG_ERROR([*** libcap is required but was not found])
+fi
+
AC_ARG_WITH(priv-mode,
AS_HELP_STRING([--with-priv-mode=setuid/none],
[How to set privilege-raising during make install]),
diff --git a/demos/userns-block-fd.py b/demos/userns-block-fd.py
new file mode 100755
index 00000000..0677a0d0
--- /dev/null
+++ b/demos/userns-block-fd.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+# Demo for --userns-block-fd: bwrap unshares the user namespace, reports the
+# child pid on --info-fd and then waits; this script writes the uid/gid maps
+# with newuidmap/newgidmap and finally unblocks the sandbox.
+
+import os, select, subprocess, json
+
+pipe_info = os.pipe()
+userns_block = os.pipe()
+
+pid = os.fork()
+
+if pid != 0:
+    os.close(pipe_info[1])
+    os.close(userns_block[0])
+
+    select.select([pipe_info[0]], [], [])
+
+    data = json.load(os.fdopen(pipe_info[0]))
+    child_pid = str(data['child-pid'])
+
+    subprocess.call(["newuidmap", child_pid, "0", str(os.getuid()), "1"])
+    subprocess.call(["newgidmap", child_pid, "0", str(os.getgid()), "1"])
+
+    # os.write() needs bytes on Python 3
+    os.write(userns_block[1], b'1')
+else:
+    os.close(pipe_info[0])
+    os.close(userns_block[1])
+
+    # Pipe fds are close-on-exec by default (PEP 446); bwrap must inherit them
+    os.set_inheritable(pipe_info[1], True)
+    os.set_inheritable(userns_block[0], True)
+
+    args = ["bwrap",
+            "bwrap",
+            "--unshare-all",
+            "--unshare-user",
+            "--userns-block-fd", "%i" % userns_block[0],
+            "--info-fd", "%i" % pipe_info[1],
+            "--bind", "/", "/",
+            "cat", "/proc/self/uid_map"]
+
+    # execlp() searches PATH; execl() would need a full path to bwrap
+    os.execlp(*args)
diff --git a/tests/test-run.sh b/tests/test-run.sh
index e79d51ca..43169d79 100755
--- a/tests/test-run.sh
+++ b/tests/test-run.sh
@@ -70,7 +70,15 @@ for ALT in "" "--unshare-user-try" "--unshare-pid" "--unshare-user-try --unshar
$RUN $ALT --unshare-net --proc /proc --dev /dev true
# Unreadable file
echo -n "expect EPERM: "
- if $RUN $ALT --unshare-net --proc /proc --bind /etc/shadow /tmp/foo cat /etc/shadow; then
+
+ # Test caps when bwrap is not setuid
+ if ! test -u ${BWRAP}; then
+ CAP="--cap-add ALL"
+ else
+ CAP=""
+ fi
+
+ if $RUN $CAP $ALT --unshare-net --proc /proc --bind /etc/shadow /tmp/foo cat /etc/shadow; then
assert_not_reached Could read /etc/shadow
fi
# Unreadable dir
@@ -89,6 +97,12 @@ done
$RUN --unshare-pid --as-pid-1 --bind / / bash -c 'echo $$' > as_pid_1.txt
assert_file_has_content as_pid_1.txt "1"
+# Check that by default we have no caps left (scratch file kept in the cwd, like as_pid_1.txt)
+for OPT in "" "--unshare-user-try --as-pid-1" "--unshare-user-try" "--as-pid-1"; do
+    $RUN $OPT --unshare-pid getpcaps 1 2> caps.test
+    grep -q ": =$" caps.test
+done
+
# Test --die-with-parent
cat >lockf-n.py <