Skip to content

Commit

Permalink
feat: Increase backpressure threshold to 5 seconds (#340)
Browse files Browse the repository at this point in the history
During INC-626 we saw excessive backpressure, which caused the consumer to be paused and resumed frequently. This led to excessive network traffic, as each pause purged the local queue and forced messages to be re-downloaded on resume. Increase the pause threshold to reduce the impact of this.
  • Loading branch information
lynnagara committed Feb 23, 2024
1 parent 51addf8 commit 4752f58
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 7 deletions.
5 changes: 3 additions & 2 deletions arroyo/processing/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
logger = logging.getLogger(__name__)

METRICS_FREQUENCY_SEC = 1.0 # In seconds
BACKPRESSURE_THRESHOLD = 5.0 # In seconds

F = TypeVar("F", bound=Callable[[Any], Any])

Expand Down Expand Up @@ -150,7 +151,7 @@ def __init__(

# The timestamp when backpressure state started
self.__backpressure_timestamp: Optional[float] = None
# Consumer is paused after it is in backpressure state for > 1 second
# Consumer is paused after it is in backpressure state for > BACKPRESSURE_THRESHOLD seconds
self.__is_paused = False

self.__commit_policy_state = commit_policy.get_state_machine()
Expand Down Expand Up @@ -421,7 +422,7 @@ def _run_once(self) -> None:
self.__backpressure_timestamp = time.time()

elif not self.__is_paused and (
time.time() - self.__backpressure_timestamp > 1
time.time() - self.__backpressure_timestamp > BACKPRESSURE_THRESHOLD
):
self.__metrics_buffer.incr_counter("arroyo.consumer.pause", 1)
logger.debug(
Expand Down
6 changes: 3 additions & 3 deletions rust-arroyo/src/processing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub enum RunError {
Strategy(#[source] Box<dyn std::error::Error>),
}

const BACKPRESSURE_THRESHOLD: Duration = Duration::from_secs(1);
const BACKPRESSURE_THRESHOLD: Duration = Duration::from_secs(5);

#[derive(Clone)]
pub struct ConsumerState<TPayload>(Arc<(AtomicBool, Mutex<ConsumerStateInner<TPayload>>)>);
Expand Down Expand Up @@ -362,12 +362,12 @@ impl<TPayload: Clone + Send + Sync + 'static> StreamProcessor<TPayload> {
return Ok(());
};

// If we are in the backpressure state for more than 1 second,
// If we are in the backpressure state for more than BACKPRESSURE_THRESHOLD seconds,
// we pause the consumer and hold the message until it is
// accepted, at which point we can resume consuming.
if !consumer_is_paused && deadline.has_elapsed() {
tracing::warn!(
"Consumer is in backpressure state for more than 1 second, pausing",
"Consumer is in backpressure state for more than 5 seconds, pausing",
);

let partitions = self.consumer.tell().unwrap().into_keys().collect();
Expand Down
6 changes: 4 additions & 2 deletions tests/processing/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ def test_stream_processor_lifecycle() -> None:
with assert_changes(lambda: int(consumer.pause.call_count), 0, 1):
processor._run_once()
assert strategy.submit.call_args_list[-1] == mock.call(message)
time.sleep(1)
processor._run_once() # Should pause now

with mock.patch("time.time", return_value=time.time() + 5):
# time.sleep(5)
processor._run_once() # Should pause now

# Once the message is accepted by the processing strategy, the consumer
# should be resumed.
Expand Down

0 comments on commit 4752f58

Please sign in to comment.