Skip to content

Commit

Permalink
main: fix high latency generated by file handle creation
Browse files Browse the repository at this point in the history
Whenever the file descriptor table is full, Linux expands it by
doubling its size.
The filesystem code that does this uses RCU synchronization to ensure
all pre-existing RCU read-side critical sections have completed. The
latency induced by this synchronization is a big part of the total
time required to restore a snapshot.
The kernel has an optimization in the expand table code path where it
doesn't call synchronize_rcu() if there is only one thread in the
process.
To address this issue we attempt to expand the descriptor table at the
application start, when it has only one thread.

Signed-off-by: Alexandru Matei <alexandru.matei@uipath.com>
  • Loading branch information
alex-matei committed May 23, 2024
1 parent 5a811af commit 37b9ba7
Showing 1 changed file with 105 additions and 2 deletions.
107 changes: 105 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ use log::{warn, LevelFilter};
use option_parser::OptionParser;
use seccompiler::SeccompAction;
use signal_hook::consts::SIGSYS;
use std::env;
use std::fs::File;
use std::os::unix::io::{FromRawFd, RawFd};
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::sync::mpsc::channel;
use std::sync::{Arc, Mutex};
use std::{env, io};
use thiserror::Error;
#[cfg(feature = "dbus_api")]
use vmm::api::dbus::{dbus_api_graceful_shutdown, DBusApiOptions};
Expand Down Expand Up @@ -87,6 +87,20 @@ enum Error {
HttpApiShutdown(#[source] vmm::Error),
}

/// Errors that can occur while pre-expanding the process file
/// descriptor table (see `expand_fdtable`). All variants wrap the
/// underlying OS error for diagnostics.
#[derive(Error, Debug)]
enum FdTableError {
    /// Creating the throwaway eventfd used to force table growth failed.
    #[error("Failed to create event fd: {0}")]
    CreateEventFd(std::io::Error),
    /// `getrlimit(RLIMIT_NOFILE)` failed.
    #[error("Failed to obtain file limit: {0}")]
    GetRLimit(std::io::Error),
    /// `fcntl(fd, F_GETFD)` probe failed with something other than EBADF.
    #[error("Error calling fcntl with F_GETFD: {0}")]
    GetFd(std::io::Error),
    /// Closing the temporarily duplicated high fd failed; this leaves a
    /// stray descriptor behind, which callers may treat as fatal.
    #[error("Failed to close file handle: {0}")]
    Close(std::io::Error),
    /// `dup2` onto the high slot failed.
    #[error("Failed to duplicate file handle: {0}")]
    Dup2(std::io::Error),
}

struct Logger {
output: Mutex<Box<dyn std::io::Write + Send>>,
start: std::time::Instant,
Expand Down Expand Up @@ -782,6 +796,85 @@ fn start_vmm(cmd_arguments: ArgMatches) -> Result<Option<String>, Error> {
r.map(|_| api_socket_path)
}

// Best-effort mitigation for kernel-side RCU latency: whenever the fd
// table fills up, Linux doubles it and, in the expand path, calls
// synchronize_rcu() to wait out all pre-existing RCU read-side critical
// sections. The initial table only has 64 entries on amd64, so Rust
// programs that fan out handles via {File,EventFd}::try_clone() hit
// this quickly — most visibly during snapshot restore, where the
// synchronization is a large share of total startup time.
// The kernel skips synchronize_rcu() when the process is still
// single-threaded, so we force the expansion here, at application
// start, before any other thread exists.
fn expand_fdtable() -> Result<(), FdTableError> {
    let mut rlim = libc::rlimit {
        rlim_cur: 0,
        rlim_max: 0,
    };

    // SAFETY: FFI call with valid arguments
    if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) } < 0 {
        return Err(FdTableError::GetRLimit(io::Error::last_os_error()));
    }

    // Throwaway descriptor we can dup2() into a high slot below.
    let placeholder = EventFd::new(EFD_NONBLOCK).map_err(FdTableError::CreateEventFd)?;

    // Aim for a 4096-entry table. If the soft limit is below that, try
    // to raise it (capped at the hard limit); if that fails we settle
    // for whatever the current soft limit allows.
    let target_size: i32 = if rlim.rlim_cur >= 4096 {
        4096
    } else {
        let raised = libc::rlimit {
            rlim_cur: std::cmp::min(4096, rlim.rlim_max),
            rlim_max: rlim.rlim_max,
        };

        // SAFETY: FFI call with valid arguments
        if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &raised) } == 0 {
            raised.rlim_cur as i32
        } else {
            // Fall back to the (sub-4096) soft limit we started with.
            rlim.rlim_cur as i32
        }
    };

    // Slots 0..=2 are stdin/stdout/stderr and must stay untouched. A
    // target of 3 or less means even the hard limit leaves us no slot
    // to play with — nothing we can do.
    if target_size <= 3 {
        return Ok(());
    }

    let top_fd = target_size - 1;

    // Probe whether the highest slot is free.
    // SAFETY: FFI call with valid arguments
    if unsafe { libc::fcntl(top_fd, libc::F_GETFD) } >= 0 {
        // The slot is occupied, so the table is already large enough.
        return Ok(());
    }

    let probe_err = io::Error::last_os_error();
    if probe_err.raw_os_error() != Some(libc::EBADF) {
        return Err(FdTableError::GetFd(probe_err));
    }

    // EBADF: the slot is empty. Duplicating into it forces the kernel
    // to grow the table to cover it, then we immediately release it.
    // SAFETY: FFI call with valid arguments
    if unsafe { libc::dup2(placeholder.as_raw_fd(), top_fd) } < 0 {
        return Err(FdTableError::Dup2(io::Error::last_os_error()));
    }
    // SAFETY: FFI call, trivially safe
    if unsafe { libc::close(top_fd) } != 0 {
        return Err(FdTableError::Close(io::Error::last_os_error()));
    }

    Ok(())
}

fn main() {
#[cfg(all(feature = "tdx", feature = "sev_snp"))]
compile_error!("Feature 'tdx' and 'sev_snp' are mutually exclusive.");
Expand All @@ -808,6 +901,16 @@ fn main() {
return;
}

if let Err(e) = expand_fdtable() {
match e {
FdTableError::Close(_) => {
eprintln!("Error expanding FD table: {e}");
std::process::exit(1);
}
_ => warn!("Error expanding FD table: {e}"),
}
}

let exit_code = match start_vmm(cmd_arguments) {
Ok(path) => {
path.map(|s| std::fs::remove_file(s).ok());
Expand Down

0 comments on commit 37b9ba7

Please sign in to comment.