Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 181 additions & 29 deletions bubblewrap.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,14 @@ uid_t opt_sandbox_uid = -1;
gid_t opt_sandbox_gid = -1;
int opt_sync_fd = -1;
int opt_block_fd = -1;
int opt_userns_block_fd = -1;
int opt_info_fd = -1;
int opt_seccomp_fd = -1;
char *opt_sandbox_hostname = NULL;

#define CAP_TO_MASK_0(x) (1L << ((x) & 31))
#define CAP_TO_MASK_1(x) CAP_TO_MASK_0(x - 32)

typedef enum {
SETUP_BIND_MOUNT,
SETUP_RO_BIND_MOUNT,
Expand Down Expand Up @@ -217,10 +221,13 @@ usage (int ecode, FILE *out)
" --symlink SRC DEST Create symlink at DEST with target SRC\n"
" --seccomp FD Load and use seccomp rules from FD\n"
" --block-fd FD Block on FD until some data to read is available\n"
" --userns-block-fd FD Block on FD until the user namespace is ready\n"
" --info-fd FD Write information about the running container to FD\n"
" --new-session Create a new terminal session\n"
" --die-with-parent Kills with SIGKILL child process (COMMAND) when bwrap or bwrap's parent dies.\n"
" --as-pid-1 Do not install a reaper process with PID=1\n"
" --cap-add CAP Add cap CAP when running as privileged user\n"
" --cap-drop CAP Drop cap CAP when running as privileged user\n"
);
exit (ecode);
}
Expand Down Expand Up @@ -450,8 +457,13 @@ do_init (int event_fd, pid_t initial_pid, struct sock_fprog *seccomp_prog)
return initial_exit_status;
}

#define CAP_TO_MASK_0(x) (1L << ((x) & 31))
#define CAP_TO_MASK_1(x) CAP_TO_MASK_0(x - 32)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIt: this looks like a dupe.


static uint32_t requested_caps[2] = {0, 0};

/* low 32bit caps needed */
#define REQUIRED_CAPS_0 (CAP_TO_MASK (CAP_SYS_ADMIN) | CAP_TO_MASK (CAP_SYS_CHROOT) | CAP_TO_MASK (CAP_NET_ADMIN) | CAP_TO_MASK (CAP_SETUID) | CAP_TO_MASK (CAP_SETGID))
#define REQUIRED_CAPS_0 (CAP_TO_MASK_0 (CAP_SYS_ADMIN) | CAP_TO_MASK_0 (CAP_SYS_CHROOT) | CAP_TO_MASK_0 (CAP_NET_ADMIN) | CAP_TO_MASK_0 (CAP_SETUID) | CAP_TO_MASK_0 (CAP_SETGID))
/* high 32bit caps needed */
#define REQUIRED_CAPS_1 0

Expand All @@ -473,11 +485,21 @@ set_required_caps (void)
}

static void
drop_all_caps (void)
drop_all_caps (bool keep_requested_caps)
{
struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 };
struct __user_cap_data_struct data[2] = { { 0 } };

if (keep_requested_caps)
{
data[0].effective = requested_caps[0];
data[0].permitted = requested_caps[0];
data[0].inheritable = requested_caps[0];
data[1].effective = requested_caps[1];
data[1].permitted = requested_caps[1];
data[1].inheritable = requested_caps[1];
}

if (capset (&hdr, data) < 0)
die_with_error ("capset failed");
}
Expand All @@ -494,8 +516,12 @@ has_caps (void)
return data[0].permitted != 0 || data[1].permitted != 0;
}

/* Most of the code here is used both to add caps to the ambient capabilities
* and drop caps from the bounding set. Handle both cases here and add
* drop_cap_bounding_set/set_ambient_capabilities wrappers to facilitate its usage.
*/
static void
drop_cap_bounding_set (void)
prctl_caps (uint32_t *caps, bool do_cap_bounding, bool do_set_ambient)
{
unsigned long cap;

Expand All @@ -506,14 +532,56 @@ drop_cap_bounding_set (void)
* https://github.com/projectatomic/bubblewrap/pull/175#issuecomment-278051373
* https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=160da84dbb39443fdade7151bc63a88f8e953077
*/
for (cap = 0; cap <= 63; cap++)
for (cap = 0; cap <= CAP_LAST_CAP; cap++)
{
bool keep = FALSE;
if (cap < 32)
{
if (CAP_TO_MASK_0 (cap) & caps[0])
keep = TRUE;
}
else
{
if (CAP_TO_MASK_1 (cap) & caps[1])
keep = TRUE;
}

if (keep && do_set_ambient)
{
int res = prctl (PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0);
if (res == -1 && !(errno == EINVAL || errno == EPERM))
die_with_error ("Adding ambient capability %ld", cap);
}

if (!keep && do_cap_bounding)
{
int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0);
if (res == -1 && !(errno == EINVAL || errno == EPERM))
die_with_error ("Dropping capability %ld from bounds", cap);
}
}
}

static void
drop_cap_bounding_set (bool drop_all)
{
if (!drop_all)
prctl_caps (requested_caps, TRUE, FALSE);
else
{
int res = prctl (PR_CAPBSET_DROP, cap, 0, 0, 0);
if (res == -1 && !(errno == EINVAL || errno == EPERM))
die_with_error ("Dropping capability %ld from bounds", cap);
uint32_t no_caps[2] = {0, 0};
prctl_caps (no_caps, TRUE, FALSE);
}
}

static void
set_ambient_capabilities (void)
{
if (is_privileged)
return;
prctl_caps (requested_caps, FALSE, TRUE);
}

/* This acquires the privileges that the bwrap will need it to work.
* If bwrap is not setuid, then this does nothing, and it relies on
* unprivileged user namespaces to be used. This case is
Expand Down Expand Up @@ -562,8 +630,8 @@ acquire_privs (void)
if (new_fsuid != real_uid)
die ("Unable to set fsuid (was %d)", (int)new_fsuid);

/* We never need capabilies after execve(), so lets drop everything from the bounding set */
drop_cap_bounding_set ();
/* We never need capabilities after execve(), so lets drop everything from the bounding set */
drop_cap_bounding_set (TRUE);

/* Keep only the required capabilities for setup */
set_required_caps ();
Expand All @@ -585,7 +653,7 @@ switch_to_user_with_privs (void)
{
/* If we're in a new user namespace, we got back the bounding set, clear it again */
if (opt_unshare_user)
drop_cap_bounding_set ();
drop_cap_bounding_set (FALSE);

if (!is_privileged)
return;
Expand All @@ -602,16 +670,13 @@ switch_to_user_with_privs (void)
}

static void
drop_privs (void)
drop_privs (bool keep_requested_caps)
{
if (!is_privileged)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't tested it, though I suspect that this will cause https://pagure.io/releng/issue/6550 to regress; systemd-nspawn blocks capset(), but with this check removed, we'll always be calling it. (Note that #122 originally worked around this by only setting caps in the setuid case).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is also what's causing coreos/rpm-ostree#855; we're now dropping CAP_DAC_OVERRIDE, so when rpm-ostree runs %pre scripts that do e.g. groupadd, they get EPERM on /etc/gshadow.

return;

/* Drop root uid */
if (setuid (opt_sandbox_uid) < 0)
if (getuid () == 0 && setuid (opt_sandbox_uid) < 0)
die_with_error ("unable to drop root uid");

drop_all_caps ();
drop_all_caps (keep_requested_caps);
}

static char *
Expand Down Expand Up @@ -1541,6 +1606,23 @@ parse_args_recurse (int *argcp,

opt_block_fd = the_fd;

argv += 1;
argc -= 1;
}
else if (strcmp (arg, "--userns-block-fd") == 0)
{
int the_fd;
char *endptr;

if (argc < 2)
die ("--userns-block-fd takes an argument");

the_fd = strtol (argv[1], &endptr, 10);
if (argv[1][0] == 0 || endptr[0] != 0 || the_fd < 0)
die ("Invalid fd: %s", argv[1]);

opt_userns_block_fd = the_fd;

argv += 1;
argc -= 1;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems this leaked into the wrong commit

Expand Down Expand Up @@ -1658,6 +1740,54 @@ parse_args_recurse (int *argcp,
{
opt_as_pid_1 = TRUE;
}
else if (strcmp (arg, "--cap-add") == 0)
{
cap_value_t cap;
if (argc < 2)
die ("--cap-add takes an argument");

if (strcasecmp (argv[1], "ALL") == 0)
{
requested_caps[0] = requested_caps[1] = 0xFFFFFFFF;
}
else
{
if (cap_from_name (argv[1], &cap) < 0)
die ("unknown cap: %s", argv[1]);

if (cap < 32)
requested_caps[0] |= CAP_TO_MASK_0 (cap);
else
requested_caps[1] |= CAP_TO_MASK_1 (cap - 32);
}

argv += 1;
argc -= 1;
}
else if (strcmp (arg, "--cap-drop") == 0)
{
cap_value_t cap;
if (argc < 2)
die ("--cap-drop takes an argument");

if (strcasecmp (argv[1], "ALL") == 0)
{
requested_caps[0] = requested_caps[1] = 0;
}
else
{
if (cap_from_name (argv[1], &cap) < 0)
die ("unknown cap: %s", argv[1]);

if (cap < 32)
requested_caps[0] &= ~CAP_TO_MASK_0 (cap);
else
requested_caps[1] &= ~CAP_TO_MASK_1 (cap - 32);
}

argv += 1;
argc -= 1;
}
else if (*arg == '-')
{
die ("Unknown option %s", arg);
Expand Down Expand Up @@ -1764,6 +1894,15 @@ main (int argc,

parse_args (&argc, &argv);

if ((requested_caps[0] || requested_caps[1]) && is_privileged)
die ("--cap-add in setuid mode can be used only by root");

if (opt_userns_block_fd != -1 && !opt_unshare_user)
die ("--userns-block-fd requires --unshare-user");

if (opt_userns_block_fd != -1 && opt_info_fd == -1)
die ("--userns-block-fd requires --info-fd");

/* We have to do this if we weren't installed setuid (and we're not
* root), so let's just DWIM */
if (!is_privileged && getuid () != 0)
Expand Down Expand Up @@ -1902,7 +2041,7 @@ main (int argc,
{
/* Parent, outside sandbox, privileged (initially) */

if (is_privileged && opt_unshare_user)
if (is_privileged && opt_unshare_user && opt_userns_block_fd == -1)
{
/* We're running as euid 0, but the uid we want to map is
* not 0. This means we're not allowed to write this from
Expand All @@ -1920,17 +2059,11 @@ main (int argc,
/* Initial launched process, wait for exec:ed command to exit */

/* We don't need any privileges in the launcher, drop them immediately. */
drop_privs ();
drop_privs (FALSE);

/* Optionally bind our lifecycle to that of the parent */
handle_die_with_parent ();

/* Let child run now that the uid maps are set up */
val = 1;
res = write (child_wait_fd, &val, 8);
/* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */
close (child_wait_fd);

if (opt_info_fd != -1)
{
cleanup_free char *output = xasprintf ("{\n \"child-pid\": %i\n}\n", pid);
Expand All @@ -1940,6 +2073,19 @@ main (int argc,
close (opt_info_fd);
}

if (opt_userns_block_fd != -1)
{
char b[1];
(void) TEMP_FAILURE_RETRY (read (opt_userns_block_fd, b, 1));
close (opt_userns_block_fd);
}

/* Let child run now that the uid maps are set up */
val = 1;
res = write (child_wait_fd, &val, 8);
/* Ignore res, if e.g. the child died and closed child_wait_fd we don't want to error out here */
close (child_wait_fd);

monitor_child (event_fd, pid);
exit (0); /* Should not be reached, but better safe... */
}
Expand Down Expand Up @@ -1976,7 +2122,7 @@ main (int argc,

ns_uid = opt_sandbox_uid;
ns_gid = opt_sandbox_gid;
if (!is_privileged && opt_unshare_user)
if (!is_privileged && opt_unshare_user && opt_userns_block_fd == -1)
{
/* In the unprivileged case we have to write the uid/gid maps in
* the child, because we have no caps in the parent */
Expand Down Expand Up @@ -2050,7 +2196,7 @@ main (int argc,
if (child == 0)
{
/* Unprivileged setup process */
drop_privs ();
drop_privs (FALSE);
close (privsep_sockets[0]);
setup_newroot (opt_unshare_pid, privsep_sockets[1]);
exit (0);
Expand Down Expand Up @@ -2093,7 +2239,8 @@ main (int argc,
die_with_error ("unmount old root");

if (opt_unshare_user &&
(ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid))
(ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid) &&
opt_userns_block_fd == -1)
{
/* Now that devpts is mounted and we've no need for mount
permissions we can create a new userspace and map our uid
Expand All @@ -2115,8 +2262,8 @@ main (int argc,
if (chdir ("/") != 0)
die_with_error ("chdir /");

/* All privileged ops are done now, so drop it */
drop_privs ();
/* All privileged ops are done now, so drop caps we don't need */
drop_privs (!is_privileged);

if (opt_block_fd != -1)
{
Expand Down Expand Up @@ -2188,6 +2335,8 @@ main (int argc,

if (pid != 0)
{
drop_all_caps (FALSE);

/* Close fds in pid 1, except stdio and optionally event_fd
(for syncing pid 2 lifetime with monitor_child) and
opt_sync_fd (for syncing sandbox lifetime with outside
Expand Down Expand Up @@ -2227,6 +2376,9 @@ main (int argc,
/* Optionally bind our lifecycle */
handle_die_with_parent ();

if (!is_privileged)
set_ambient_capabilities ();

/* Should be the last thing before execve() so that filters don't
* need to handle anything above */
if (seccomp_data != NULL &&
Expand Down
Loading