Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ sentry = { version = "0.41.0", default-features = false, features = [
"tracing",
"logs"
] }
sentry_protos = "0.8.13"
sentry_protos = "0.10.0"
serde = "1.0.214"
serde_bytes = "0.11"
serde_yaml = "0.9.34"
Expand Down
6 changes: 5 additions & 1 deletion benches/store_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,11 @@ async fn set_status(num_activations: u32, num_workers: u32) {
for task_id in 0..num_activations {
if task_id % num_workers == worker_idx {
store
.set_status(&format!("id_{task_id}"), InflightActivationStatus::Complete)
.set_status(
&format!("id_{task_id}"),
InflightActivationStatus::Complete,
None,
)
.await
.unwrap();
}
Expand Down
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ readme = "README.md"
dependencies = [
"sentry-arroyo>=2.38.7",
"sentry-sdk[http2]>=2.43.0",
"sentry-protos>=0.8.13",
"sentry-protos>=0.10.0",
"confluent_kafka>=2.3.0",
"cronsim>=2.6",
"grpcio>=1.67.0",
Expand Down
1 change: 1 addition & 0 deletions clients/python/src/taskbroker_client/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ class ProcessingResult:
status: TaskActivationStatus.ValueType
host: str
receive_timestamp: float
max_attempts: int | None = None
2 changes: 2 additions & 0 deletions clients/python/src/taskbroker_client/worker/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ def update_task(
id=processing_result.task_id,
status=processing_result.status,
fetch_next_task=fetch_next_task,
max_attempts=processing_result.max_attempts,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have an example of how the task will define this value?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)

try:
Expand Down Expand Up @@ -566,6 +567,7 @@ def update_task(
id=processing_result.task_id,
status=processing_result.status,
fetch_next_task=None,
max_attempts=processing_result.max_attempts,
)

retries = 0
Expand Down
8 changes: 8 additions & 0 deletions clients/python/src/taskbroker_client/worker/workerchild.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,14 @@ def handle_alarm(signum: int, frame: FrameType | None) -> None:
status=next_state,
host=inflight.host,
receive_timestamp=inflight.receive_timestamp,
# Send max_attempts if this is a retry. Don't send it
# on every task as this codepath is relatively
# unoptimized on the broker side.
max_attempts=(
task_func.retry._times + 1
if task_func.retry and next_state == TASK_ACTIVATION_STATUS_RETRY
else None
),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raw retries lose delay

Medium Severity

On retry, the worker only sends max_attempts to the broker. For raw-mode activations, retry_state was absent in the stored blob; set_status then inserts a minimal state with only max_attempts. Upkeep republish uses delay_on_retry from that blob, so configured retry backoff is dropped and retries run immediately.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 0bf9105. Configure here.

Copy link
Copy Markdown
Member Author

@untitaker untitaker May 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

damn, i thought we had a nonzero default value for this. will probably add delay_on_retry to SetTaskStatusRequest

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)
)

Expand Down
6 changes: 6 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ pub struct Config {
/// The location to the DLQ private key file
pub kafka_deadletter_ssl_key_location: Option<String>,

/// The topic to publish retry task activations to.
/// When set, retries go to this topic instead of kafka_topic.
/// Required for raw_mode where the main topic has other consumers.
pub kafka_retry_topic: Option<String>,

/// The default number of partitions for a topic
pub default_topic_partitions: i32,

Expand Down Expand Up @@ -371,6 +376,7 @@ impl Default for Config {
kafka_deadletter_ssl_ca_location: None,
kafka_deadletter_ssl_certificate_location: None,
kafka_deadletter_ssl_key_location: None,
kafka_retry_topic: None,
default_topic_partitions: 1,
kafka_session_timeout_ms: 6000,
kafka_auto_commit_interval_ms: 5000,
Expand Down
1 change: 1 addition & 0 deletions src/fetch/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ impl InflightActivationStore for MockStore {
&self,
_id: &str,
_status: InflightActivationStatus,
_max_attempts: Option<u32>,
) -> Result<Option<InflightActivation>, Error> {
unimplemented!()
}
Expand Down
11 changes: 9 additions & 2 deletions src/grpc/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,14 @@ impl ConsumerService for TaskbrokerServer {
metrics::counter!("grpc_server.set_status.failure").increment(1);
}

if let Some(ref tx) = self.update_tx {
let max_attempts = request.get_ref().max_attempts;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already asked in a different comment, but I think that code is now outdated so I'll ask here again. How often will there be a max_attempts field on the message? One out of every... 10? 100? 1000? If it's going to be present often, we'll need to rethink how batching works. Because batching is necessary to reach high throughput.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be present always in tasks that run out of standard topics. I'd go further and say that maybe we should simplify the system by moving all tasks to specify the retries via the set_status method so we do not maintain more than one implementation.

Why does this affect the way batching is implemented?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The field will be there unconditionally on all retry status updates. So retries are effectively not batched. I can implement batching for retries but this would increase complexity.

Note that not every retry status update results in an additional DB query. max_attempts is only set in the DB if it wasn't there before.

This should be present always in tasks that run out of standard topics.

It's optional here for the sake of rollout. We can gradually increase the amount of tasks that send max_attempts through the worker and observe its impact on the broker. (the rollout mechanism isn't implemented here)


// Use batching channel if available and we don't need to update retry state.
// If max_attempts is Some, we can't use batching API to update the activation, and have to
// fall back to individual set_status.
if let Some(ref tx) = self.update_tx
&& max_attempts.is_none()
{
Comment on lines +111 to +116
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Taking a step back on the retry topic.
If the retry topic contains activations rather than the original message, why do we even need a topic? Can't we just store the activation in the DB as pending and treat it like a task for the rest of its lifetime?

I recognize this is a departure from the original intent of this PR, but it seems a lot simpler to me to manage it this way. The idea of the topic, to me, was meant to use it as a DLQ as well. Am I missing something ?

Copy link
Copy Markdown
Member Author

@untitaker untitaker May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't we just store the activation in the DB as pending and treat it like a task for the rest of its lifetime ?

I am missing rationale for why the retry system was set up this way to begin with. Your suggestion also applies to how regular tasks work. I would not want to special-case raw-mode to handle retries fundamentally differently than regular tasks.

My guess is that we wanted to keep the size of a database under control, therefore pruning queued retries out of the DB and putting them back into Kafka. If we say that this is not really a concern with AlloyDB then that's fine, but we'd have to validate that IMO

I can explore this option in another PR, but not sure we should roll it out without having more context from folks who originally worked on taskbroker.

The idea of the topic, to me, was meant to use it as a DLQ as well

That is yet another topic. It can stay or go away regardless of what we decide wrt retries.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@markstory @enochtangg do you remember why we didn't "just" keep retries in the DB instead of producing them back into Kafka?

Copy link
Copy Markdown

@fpacifici fpacifici May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your suggestion also applies to how regular tasks work.

I thought we picked up the task from the database to do the retry. @george-sentry did we change something in the push model ?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from slack:

@markstory: Keeping retries in sqlite/postgres could work but those retries will consume slots in sqlite/postgres. It will mean that a retry with no delay runs right away though instead of 'later' when it is found again in the topic.

@enochtangg: Another benefit I remember was because it resets the latency metric. Since latency is task dispatched - kafka receive latency, re-producing in kafka means we don't need to somehow fix that.

@fpacifici: the latency metric argument is important. I think we can keep it as it is and use the topic.  It is true that in AlloyDB there will be more room, but we will have the sqlite around for a while. No need to make the change

So I think I won't change anything here.

tx.send((id, status))
.await
.map_err(|_| Status::internal("Status update channel closed"))?;
Comment on lines +109 to 119
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: The Python worker unconditionally sends max_attempts for tasks with retry policies. This forces the gRPC server to bypass its batching optimization, causing an N+1 database query issue.
Severity: HIGH

Suggested Fix

Modify the Python worker to only send max_attempts when a task is actually being retried. For completed or failed tasks, max_attempts should not be sent, allowing the gRPC server's max_attempts.is_none() check to pass and utilize the intended batching optimization.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent. Verify if this is a real issue. If it is, propose a fix; if not, explain why it's
not valid.

Location: src/grpc/server.rs#L106-L119

Potential issue: The Python worker unconditionally sends a `max_attempts` value for any
task with a retry decorator, regardless of its final status (complete, failure, or
retry). In the gRPC server, the batching channel (`update_tx`) is only used if
`max_attempts.is_none()`. Because the Python client always sends `max_attempts`, this
condition is never met for tasks with retry policies. This forces a fallback to an
individual `self.store.set_status()` call for each task, creating a significant
performance regression by introducing an N+1 database query problem instead of using a
single batched update.

Expand All @@ -115,7 +122,7 @@ impl ConsumerService for TaskbrokerServer {
return Ok(Response::new(SetTaskStatusResponse { task: None }));
}

match self.store.set_status(&id, status).await {
match self.store.set_status(&id, status, max_attempts).await {
Ok(Some(_)) => metrics::counter!(
"grpc_server.set_status",
"result" => "ok",
Expand Down
8 changes: 8 additions & 0 deletions src/grpc/server_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ async fn test_set_task_status(#[case] adapter: &str) {
id: "test_task".to_string(),
status: 5, // Complete
fetch_next_task: None,
max_attempts: None,
};

let response = service.set_task_status(Request::new(request)).await;
Expand Down Expand Up @@ -113,6 +114,7 @@ async fn test_set_task_status_invalid(#[case] adapter: &str) {
id: "test_task".to_string(),
status: 1, // Invalid
fetch_next_task: None,
max_attempts: None,
};

let response = service.set_task_status(Request::new(request)).await;
Expand Down Expand Up @@ -266,6 +268,7 @@ async fn test_set_task_status_success(#[case] adapter: &str) {
namespace: None,
application: None,
}),
max_attempts: None,
};
let response = service.set_task_status(Request::new(request)).await;
assert!(response.is_ok());
Expand Down Expand Up @@ -306,6 +309,7 @@ async fn test_set_task_status_with_application(#[case] adapter: &str) {
application: Some("hammers".into()),
namespace: None,
}),
max_attempts: None,
};

let response = service.set_task_status(Request::new(request)).await;
Expand Down Expand Up @@ -352,6 +356,7 @@ async fn test_set_task_status_with_application_no_match(#[case] adapter: &str) {
application: Some("no-matches".into()),
namespace: None,
}),
max_attempts: None,
};

let response = service.set_task_status(Request::new(request)).await;
Expand Down Expand Up @@ -386,6 +391,7 @@ async fn test_set_task_status_with_namespace_requires_application(#[case] adapte
application: None,
namespace: Some(namespace),
}),
max_attempts: None,
};

let response = service.set_task_status(Request::new(request)).await;
Expand Down Expand Up @@ -433,6 +439,7 @@ async fn test_set_task_status_forwards_to_update_channel(#[case] adapter: &str)
namespace: None,
application: None,
}),
max_attempts: None,
}))
.await
.unwrap();
Expand Down Expand Up @@ -476,6 +483,7 @@ async fn test_set_task_status_update_channel_closed_returns_internal() {
id: "id_0".to_string(),
status: 5,
fetch_next_task: None,
max_attempts: None,
}))
.await;

Expand Down
14 changes: 14 additions & 0 deletions src/kafka/deserialize.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::sync::Arc;

use anyhow::Error;
use rdkafka::Message;
use rdkafka::message::OwnedMessage;

use crate::config::Config;
Expand All @@ -12,27 +13,40 @@ use super::deserialize_raw::{self, RawConfig};
pub struct DeserializeConfig {
activation_config: DeserializeActivationConfig,
raw_config: Option<RawConfig>,
/// Retry topic always contains activations, even in raw_mode.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this comment mean? Why is the raw mode distinction important? I thought raw mode was the only mode in which we used the retry topic?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I understand now. You mean that every message in the retry topic is guaranteed to be an activation even in raw mode, whereas messages in the "normal" topic in raw mode may not be?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I understand now. You mean that every message in the retry topic is guaranteed to be an activation even in raw mode, whereas messages in the "normal" topic in raw mode may not be?

Every message in the retry topic is a TaskActivation protobuf regardless of whether the consumer is in raw mode or normal mode. This is because we need to store the retry count in the topic, somehow.

retry_topic: Option<String>,
}

impl DeserializeConfig {
    /// Assemble the deserializer settings from the application [`Config`].
    ///
    /// The activation and raw sub-configurations are each derived from
    /// `config` by their own `from_config` constructors, and the optional
    /// retry topic name is copied from `config.kafka_retry_topic`.
    pub fn from_config(config: &Config) -> Self {
        // Build the parts in the same order as the struct fields so any
        // validation performed by the sub-constructors runs in that order.
        let activation_config = DeserializeActivationConfig::from_config(config);
        let raw_config = RawConfig::from_config(config);
        let retry_topic = config.kafka_retry_topic.clone();

        Self {
            activation_config,
            raw_config,
            retry_topic,
        }
    }
}

/// Create a unified deserializer that handles both normal and raw modes.
/// In raw mode, raw Kafka bytes are wrapped into a TaskActivation.
/// In normal mode, Kafka messages are expected to contain encoded TaskActivation protos.
/// Messages from the retry topic are always deserialized as activations.
pub fn new(
config: DeserializeConfig,
) -> impl Fn(Arc<OwnedMessage>) -> Result<InflightActivation, Error> {
let raw_deserializer = config.raw_config.map(deserialize_raw::new);
let activation_deserializer = deserialize_activation::new(config.activation_config);
let retry_topic = config.retry_topic;

move |msg: Arc<OwnedMessage>| {
// Messages from the retry topic are always activations
if let Some(ref retry_topic) = retry_topic
&& msg.topic() == retry_topic
{
return activation_deserializer(msg);
}

// For main topic: use raw deserializer in raw_mode, else activation deserializer
Comment thread
cursor[bot] marked this conversation as resolved.
if let Some(ref raw_deserializer) = raw_deserializer {
raw_deserializer(msg)
} else {
Expand Down
7 changes: 7 additions & 0 deletions src/kafka/deserialize_raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ impl RawConfig {
application
);

if let Some(ref retry_topic) = config.kafka_retry_topic {
assert!(
retry_topic != &config.kafka_topic,
"kafka_retry_topic cannot equal kafka_topic when raw_mode is enabled"
);
}

Some(Self {
namespace: config
.raw_namespace
Expand Down
22 changes: 20 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,21 @@ async fn main() -> Result<(), Error> {
if config.create_missing_topics {
let kafka_client_config = config.kafka_consumer_config();
create_missing_topics(
kafka_client_config,
kafka_client_config.clone(),
&config.kafka_topic,
config.default_topic_partitions,
)
.await?;

// Create retry topic if configured
if let Some(ref retry_topic) = config.kafka_retry_topic {
create_missing_topics(
kafka_client_config,
retry_topic,
config.default_topic_partitions,
)
.await?;
}
}

if config.full_vacuum_on_start {
Expand Down Expand Up @@ -158,11 +168,19 @@ async fn main() -> Result<(), Error> {
let consumer_store = store.clone();
let consumer_config = config.clone();
let runtime_config_manager = runtime_config_manager.clone();

// Build list of topics to consume from
let mut topics_to_consume = vec![consumer_config.kafka_topic.clone()];
if let Some(ref retry_topic) = consumer_config.kafka_retry_topic {
topics_to_consume.push(retry_topic.clone());
}

async move {
// The consumer has an internal thread that listens for cancellations, so it doesn't need
// an outer select here like the other tasks.
let topic_refs: Vec<&str> = topics_to_consume.iter().map(|s| s.as_str()).collect();
start_consumer(
&[&consumer_config.kafka_topic],
&topic_refs,
&consumer_config.kafka_consumer_config(),
consumer_store.clone(),
processing_strategy!({
Expand Down
1 change: 1 addition & 0 deletions src/push/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ impl InflightActivationStore for MockStore {
&self,
_id: &str,
_status: InflightActivationStatus,
_max_attempts: Option<u32>,
) -> anyhow::Result<Option<InflightActivation>> {
Ok(None)
}
Expand Down
Loading
Loading