Skip to content

Commit

Permalink
fix(cubestore): do not stop startup warmup on errors
Browse files Browse the repository at this point in the history
Simply log the errors and continue. Also logs errors as they appear,
previsouly CubeStore used to wait for program termination.

This can produce spurious errors in the logs due to missing files after
compaction finished. To avoid them, we must propagate 404 errors from
remote fs and log those in debug mode.
  • Loading branch information
ilya-biryukov committed May 12, 2021
1 parent 6c4b35c commit 90350a3
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 18 deletions.
36 changes: 21 additions & 15 deletions rust/cubestore/src/cluster/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pub mod worker_pool;
#[cfg(not(target_os = "windows"))]
use crate::cluster::worker_pool::{MessageProcessor, WorkerPool};

use crate::ack_error;
use crate::cluster::message::NetworkMessage;
use crate::cluster::transport::{ClusterTransport, MetaStoreTransport, WorkerConnection};
use crate::config::injection::DIService;
Expand Down Expand Up @@ -1123,25 +1124,28 @@ impl ClusterImpl {
///
/// Can take awhile, use the passed cancellation token to stop the worker before it finishes.
/// Designed to run in the background.
pub async fn warmup_select_worker(&self) -> Result<(), CubeError> {
pub async fn warmup_select_worker(&self) {
if self.config_obj.select_workers().len() == 0 {
return Err(CubeError::internal(
"no select workers specified".to_owned(),
));
log::error!("No select workers specified");
return;
}
if !self.config_obj.select_workers().contains(&self.server_name) {
return Err(CubeError::internal(
"current node is not a select worker".to_owned(),
));
log::error!("Current node is not a select worker");
return;
}

if !self.config_obj.enable_startup_warmup() {
log::info!("Startup warmup disabled");
return Ok(());
return;
}

log::debug!("Requesting partitions for startup warmup");
let partitions = self.meta_store.get_warmup_partitions().await?;
let partitions = match self.meta_store.get_warmup_partitions().await {
Ok(p) => p,
Err(e) => {
log::error!("Failed to get warmup partitions: {}", e);
return;
}
};
log::debug!("Got {} partitions, running the warmup", partitions.len());

for (p, chunks) in partitions {
Expand All @@ -1151,20 +1155,22 @@ impl ClusterImpl {
if let Some(file) = partition_file_name(p.parent_partition_id, p.partition_id) {
if self.stop_token.is_cancelled() {
log::debug!("Startup warmup cancelled");
return Ok(());
return;
}
self.remote_fs.download_file(&file).await?;
// TODO: propagate 'not found' and log in debug mode. Compaction might remove files,
// so they are not errors most of the time.
ack_error!(self.remote_fs.download_file(&file).await);
}
for c in chunks {
if self.stop_token.is_cancelled() {
log::debug!("Startup warmup cancelled");
return Ok(());
return;
}
self.remote_fs.download_file(&chunk_file_name(c)).await?;
ack_error!(self.remote_fs.download_file(&chunk_file_name(c)).await);
}
}
log::debug!("Startup warmup finished");
return Ok(());
return;
}
}

Expand Down
7 changes: 4 additions & 3 deletions rust/cubestore/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,10 @@ impl CubeServices {
started_rx.await?;

let cluster = self.cluster.clone();
futures.push(tokio::spawn(
async move { cluster.warmup_select_worker().await },
))
futures.push(tokio::spawn(async move {
cluster.warmup_select_worker().await;
Ok(())
}))
}
futures.push(tokio::spawn(async move {
start_track_event_loop().await;
Expand Down
9 changes: 9 additions & 0 deletions rust/cubestore/src/util/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/// Use a macro to keep the call site information (file, line number) in the log message.
#[macro_export]
macro_rules! ack_error {
($x:expr) => {
if let std::result::Result::Err(e) = $x {
log::error!("Error: {:?}", e);
}
};
}
2 changes: 2 additions & 0 deletions rust/cubestore/src/util/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
pub mod error;
pub mod lock;
mod malloc_trim_loop;
pub mod maybe_owned;
pub mod ordfloat;
pub mod time_span;

pub use malloc_trim_loop::spawn_malloc_trim_loop;

use crate::CubeError;
Expand Down

0 comments on commit 90350a3

Please sign in to comment.