Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement --preserve-fds flag #177

Merged
merged 5 commits into from
Aug 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/commands/create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ pub struct Create {
/// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal
#[clap(short, long)]
console_socket: Option<PathBuf>,
/// Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)
#[clap(long, default_value = "0")]
preserve_fds: i32,
/// name of the container instance to be started
pub container_id: String,
}
Expand All @@ -35,12 +38,14 @@ impl Create {
pid_file: Option<PathBuf>,
bundle: PathBuf,
console_socket: Option<PathBuf>,
preserve_fds: i32,
) -> Self {
Self {
pid_file,
bundle,
console_socket,
container_id,
preserve_fds: preserve_fds,
}
}
/// Starts a new container process
Expand All @@ -49,6 +54,7 @@ impl Create {
.with_pid_file(self.pid_file.as_ref())
.with_console_socket(self.console_socket.as_ref())
.with_root_path(root_path)
.with_preserved_fds(self.preserve_fds)
.as_init(&self.bundle)
.with_systemd(systemd_cgroup)
.build()
Expand Down
4 changes: 4 additions & 0 deletions src/commands/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ pub struct Run {
/// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal
#[clap(short, long)]
console_socket: Option<PathBuf>,
/// Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)
#[clap(long, default_value = "0")]
preserve_fds: i32,
/// name of the container instance to be started
pub container_id: String,
}
Expand All @@ -29,6 +32,7 @@ impl Run {
self.pid_file.clone(),
self.bundle.clone(),
self.console_socket.clone(),
self.preserve_fds,
)
.exec(root_path.clone(), systemd_cgroup)?;

Expand Down
18 changes: 18 additions & 0 deletions src/container/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ pub struct ContainerBuilder {
pub(super) pid_file: Option<PathBuf>,
/// Socket to communicate the file descriptor of the ptty
pub(super) console_socket: Option<PathBuf>,
/// File descriptors to be passed into the container process
pub(super) preserve_fds: i32,
}

/// Builder that can be used to configure the common properties of
Expand Down Expand Up @@ -51,6 +53,7 @@ impl ContainerBuilder {
syscall: LinuxSyscall,
pid_file: None,
console_socket: None,
preserve_fds: 0,
}
}

Expand Down Expand Up @@ -129,4 +132,19 @@ impl ContainerBuilder {
self.console_socket = path.map(|p| p.into());
self
}

/// Sets the number of additional file descriptors to be passed into the
/// container process. The value is stored as-is in the builder's
/// `preserve_fds` field; no validation is performed here.
///
/// NOTE(review): per the `--preserve-fds` CLI help text these are counted on
/// top of stdio and `$LISTEN_FDS` -- confirm against the init process code.
/// # Example
///
/// ```no_run
/// # use youki::container::builder::ContainerBuilder;
///
/// ContainerBuilder::new("74f1a4cb3801".to_owned())
/// .with_preserved_fds(5);
/// ```
pub fn with_preserved_fds(mut self, preserved_fds: i32) -> Self {
self.preserve_fds = preserved_fds;
self
}
}
139 changes: 10 additions & 129 deletions src/container/builder_impl.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,15 @@
use std::{fs, io::Write, path::PathBuf};

use anyhow::{Context, Result};
use nix::{
sched,
unistd::{Gid, Uid},
};
use oci_spec::Spec;
use std::{fs, path::PathBuf};

use crate::{
capabilities, cgroups,
cgroups,
namespaces::Namespaces,
notify_socket::NotifyListener,
process::{child, fork, parent},
rootfs,
process::{child, fork, init, parent},
rootless::Rootless,
stdio::FileDescriptor,
syscall::{linux::LinuxSyscall, Syscall},
tty, utils,
syscall::linux::LinuxSyscall,
utils,
};

use super::{Container, ContainerStatus};
Expand Down Expand Up @@ -45,6 +38,8 @@ pub(super) struct ContainerBuilderImpl {
pub notify_path: PathBuf,
/// Container state
pub container: Option<Container>,
/// File descriptors preserved/passed to the container init process.
pub preserve_fds: i32,
}

impl ContainerBuilderImpl {
Expand All @@ -69,21 +64,22 @@ impl ContainerBuilderImpl {
// These init_args will be passed to the container init process,
// therefore we have to move all the variables by value. Since self
// is a shared reference, we have to clone these variables here.
let init_args = ContainerInitArgs {
let init_args = init::ContainerInitArgs {
init: self.init,
syscall: self.syscall.clone(),
spec: self.spec.clone(),
rootfs: self.rootfs.clone(),
console_socket: self.console_socket.clone(),
rootless: self.rootless.clone(),
notify_path: self.notify_path.clone(),
preserve_fds: self.preserve_fds,
child,
};

// We have to box up this closure to correctly pass to the init function
// of the new process.
let cb = Box::new(move || {
if let Err(error) = container_init(init_args) {
if let Err(error) = init::container_init(init_args) {
log::debug!("failed to run container_init: {:?}", error);
return -1;
}
Expand Down Expand Up @@ -118,118 +114,3 @@ impl ContainerBuilderImpl {
Ok(())
}
}

/// Bundle of everything `container_init` needs, handed over by value so the
/// init routine owns its inputs.
struct ContainerInitArgs {
/// Flag indicating if an init or a tenant container should be created.
/// When true, `container_init` prepares the rootfs and pivots into it.
pub init: bool,
/// Interface to operating system primitives
pub syscall: LinuxSyscall,
/// OCI compliant runtime spec
pub spec: Spec,
/// Root filesystem of the container
pub rootfs: PathBuf,
/// Socket to communicate the file descriptor of the ptty.
/// When set, `container_init` passes it to `tty::setup_console`.
pub console_socket: Option<FileDescriptor>,
/// Options for rootless containers.
/// When set, `container_init` requests a uid/gid mapping through `child`.
pub rootless: Option<Rootless>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Pipe used to communicate with the child process
pub child: child::ChildProcess,
}

/// Sets up the container environment and then executes the container payload.
///
/// The steps run in this order:
/// 1. create the notify socket listener (before any pivot root),
/// 2. apply the OOM score adjustment from the spec, if any,
/// 3. for rootless containers, request the uid/gid mapping and wait for the ack,
/// 4. apply rlimits and become root (uid 0 / gid 0),
/// 5. set up the console, join namespaces and set the hostname,
/// 6. for init containers, prepare the rootfs and pivot into it,
/// 7. switch to the configured user and adjust capabilities,
/// 8. notify the parent, wait for the start command and exec the payload.
///
/// # Errors
///
/// Returns an error when the spec has no `linux` section, `process` section
/// or hostname, when the process args are empty, or when any of the setup
/// steps above fails.
fn container_init(args: ContainerInitArgs) -> Result<()> {
    let command = &args.syscall;
    let spec = &args.spec;
    let linux = &spec.linux.as_ref().context("no linux in spec")?;
    let namespaces: Namespaces = linux.namespaces.clone().into();
    // need to create the notify socket before we pivot root, since the unix
    // domain socket used here is outside of the rootfs of container
    let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
    let proc = &spec.process.as_ref().context("no process in spec")?;
    let rootfs = &args.rootfs;
    let mut child = args.child;

    // if Out-of-memory score adjustment is set in specification. set the score
    // value for the current process check
    // https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more
    // information
    if let Some(oom_score_adj) = linux.resources.as_ref().and_then(|r| r.oom_score_adj) {
        fs::write("/proc/self/oom_score_adj", oom_score_adj.to_string())?;
    }

    // if new user is specified in specification, this will be true and new
    // namespace will be created, check
    // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
    // information
    if args.rootless.is_some() {
        // child needs to be dumpable, otherwise the non root parent is not
        // allowed to write the uid/gid maps.
        // A prctl failure is reported as an error instead of panicking inside
        // the container process.
        prctl::set_dumpable(true)
            .map_err(std::io::Error::from_raw_os_error)
            .context("failed to make the init process dumpable")?;
        child.request_identifier_mapping()?;
        child.wait_for_mapping_ack()?;
        prctl::set_dumpable(false)
            .map_err(std::io::Error::from_raw_os_error)
            .context("failed to make the init process non-dumpable")?;
    }

    // set limits and namespaces to the process
    for rlimit in proc.rlimits.iter() {
        command.set_rlimit(rlimit).context("failed to set rlimit")?;
    }

    command
        .set_id(Uid::from_raw(0), Gid::from_raw(0))
        .context("failed to become root")?;

    // set up tty if specified
    if let Some(csocketfd) = args.console_socket {
        tty::setup_console(&csocketfd)?;
    }

    // join existing namespaces
    namespaces.apply_setns()?;

    command.set_hostname(&spec.hostname.as_ref().context("no hostname in spec")?)?;

    if proc.no_new_privileges {
        // NOTE(review): a failure here is deliberately ignored by the existing
        // code; confirm whether it should abort container creation instead.
        let _ = prctl::set_no_new_privileges(true);
    }

    if args.init {
        rootfs::prepare_rootfs(
            &spec,
            &rootfs,
            namespaces
                .clone_flags
                .contains(sched::CloneFlags::CLONE_NEWUSER),
        )
        .context("Failed to prepare rootfs")?;

        // change the root of filesystem of the process to the rootfs
        command
            .pivot_rootfs(rootfs)
            .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
    }

    command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?;
    capabilities::reset_effective(command)?;
    if let Some(caps) = &proc.capabilities {
        capabilities::drop_privileges(&caps, command)?;
    }

    // notify parents that the init process is ready to execute the payload.
    child.notify_parent()?;

    // listening on the notify socket for container start command
    notify_socket.wait_for_container_start()?;

    let payload_args: &Vec<String> = &proc.args;
    let envs: &Vec<String> = &proc.env;
    // An empty args list would otherwise panic on indexing; report it instead.
    let executable = payload_args
        .first()
        .context("no command specified in process args")?;
    utils::do_exec(executable, payload_args, envs)?;

    // After do_exec is called, the process is replaced with the container
    // payload through execvp, so it should never reach here.
    unreachable!();
}
1 change: 1 addition & 0 deletions src/container/init_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ impl InitContainerBuilder {
rootless,
notify_path,
container: Some(container_state),
preserve_fds: self.base.preserve_fds,
};

builder_impl.create()?;
Expand Down
1 change: 1 addition & 0 deletions src/container/tenant_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ impl TenantContainerBuilder {
rootless,
notify_path: notify_path.clone(),
container: None,
preserve_fds: self.base.preserve_fds,
};

builder_impl.create()?;
Expand Down
Loading