getsentry · untitaker · May 11, 2026 · May 11, 2026 · May 18, 2026 · May 18, 2026
@@ -44,7 +44,7 @@ sentry = { version = "0.41.0", default-features = false, features = [
     "tracing",
     "logs"
 ] }
-sentry_protos = "0.8.13"
+sentry_protos = "0.10.0"
 serde = "1.0.214"
 serde_bytes = "0.11"
 serde_yaml = "0.9.34"

@@ -121,7 +121,11 @@ async fn set_status(num_activations: u32, num_workers: u32) {
             for task_id in 0..num_activations {
                 if task_id % num_workers == worker_idx {
                     store
-                        .set_status(&format!("id_{task_id}"), InflightActivationStatus::Complete)
+                        .set_status(
+                            &format!("id_{task_id}"),
+                            InflightActivationStatus::Complete,
+                            None,
+                        )
                         .await
                         .unwrap();
                 }

@@ -6,7 +6,7 @@ readme = "README.md"
 dependencies = [
     "sentry-arroyo>=2.38.7",
     "sentry-sdk[http2]>=2.43.0",
-    "sentry-protos>=0.8.13",
+    "sentry-protos>=0.10.0",
     "confluent_kafka>=2.3.0",
     "cronsim>=2.6",
     "grpcio>=1.67.0",

@@ -68,3 +68,4 @@ class ProcessingResult:
     status: TaskActivationStatus.ValueType
     host: str
     receive_timestamp: float
+    max_attempts: int | None = None
@@ -445,6 +445,7 @@ def update_task(
             id=processing_result.task_id,
             status=processing_result.status,
             fetch_next_task=fetch_next_task,
+            max_attempts=processing_result.max_attempts,
         )
 
         try:
@@ -566,6 +567,7 @@ def update_task(
             id=processing_result.task_id,
             status=processing_result.status,
             fetch_next_task=None,
+            max_attempts=processing_result.max_attempts,
         )
 
         retries = 0

@@ -378,6 +378,14 @@ def handle_alarm(signum: int, frame: FrameType | None) -> None:
                         status=next_state,
                         host=inflight.host,
                         receive_timestamp=inflight.receive_timestamp,
+                        # Send max_attempts only when the task is being retried.
+                        # Sending it with every status update would hit a
+                        # relatively unoptimized codepath on the broker side.
+                        max_attempts=(
+                            task_func.retry._times + 1
+                            if task_func.retry and next_state == TASK_ACTIVATION_STATUS_RETRY
+                            else None
+                        ),
                     )
                 )
 

@@ -129,6 +129,11 @@ pub struct Config {
     /// The location to the DLQ private key file
     pub kafka_deadletter_ssl_key_location: Option<String>,
 
+    /// The topic to publish retry task activations to.
+    /// When set, retries go to this topic instead of kafka_topic.
+    /// Required for raw_mode, where the main topic has other consumers.
+    pub kafka_retry_topic: Option<String>,
+
     /// The default number of partitions for a topic
     pub default_topic_partitions: i32,
 
@@ -371,6 +376,7 @@ impl Default for Config {
             kafka_deadletter_ssl_ca_location: None,
             kafka_deadletter_ssl_certificate_location: None,
             kafka_deadletter_ssl_key_location: None,
+            kafka_retry_topic: None,
             default_topic_partitions: 1,
             kafka_session_timeout_ms: 6000,
             kafka_auto_commit_interval_ms: 5000,

@@ -118,6 +118,7 @@ impl InflightActivationStore for MockStore {
         &self,
         _id: &str,
         _status: InflightActivationStatus,
+        _max_attempts: Option<u32>,
     ) -> Result<Option<InflightActivation>, Error> {
         unimplemented!()
     }

@@ -106,7 +106,14 @@ impl ConsumerService for TaskbrokerServer {
             metrics::counter!("grpc_server.set_status.failure").increment(1);
         }
 
-        if let Some(ref tx) = self.update_tx {
+        let max_attempts = request.get_ref().max_attempts;
+
+        // Use the batching channel if available and we don't need to update retry state.
+        // The channel only carries (id, status), so when max_attempts is Some we have to
+        // fall back to an individual set_status call.
+        if let Some(ref tx) = self.update_tx
+            && max_attempts.is_none()
+        {
             tx.send((id, status))
                 .await
                 .map_err(|_| Status::internal("Status update channel closed"))?;
@@ -115,7 +122,7 @@ impl ConsumerService for TaskbrokerServer {
             return Ok(Response::new(SetTaskStatusResponse { task: None }));
         }
 
-        match self.store.set_status(&id, status).await {
+        match self.store.set_status(&id, status, max_attempts).await {
             Ok(Some(_)) => metrics::counter!(
                 "grpc_server.set_status",
                 "result" => "ok",

@@ -86,6 +86,7 @@ async fn test_set_task_status(#[case] adapter: &str) {
         id: "test_task".to_string(),
         status: 5, // Complete
         fetch_next_task: None,
+        max_attempts: None,
     };
 
     let response = service.set_task_status(Request::new(request)).await;
@@ -113,6 +114,7 @@ async fn test_set_task_status_invalid(#[case] adapter: &str) {
         id: "test_task".to_string(),
         status: 1, // Invalid
         fetch_next_task: None,
+        max_attempts: None,
     };
 
     let response = service.set_task_status(Request::new(request)).await;
@@ -266,6 +268,7 @@ async fn test_set_task_status_success(#[case] adapter: &str) {
             namespace: None,
             application: None,
         }),
+        max_attempts: None,
     };
     let response = service.set_task_status(Request::new(request)).await;
     assert!(response.is_ok());
@@ -306,6 +309,7 @@ async fn test_set_task_status_with_application(#[case] adapter: &str) {
             application: Some("hammers".into()),
             namespace: None,
         }),
+        max_attempts: None,
     };
 
     let response = service.set_task_status(Request::new(request)).await;
@@ -352,6 +356,7 @@ async fn test_set_task_status_with_application_no_match(#[case] adapter: &str) {
             application: Some("no-matches".into()),
             namespace: None,
         }),
+        max_attempts: None,
     };
 
     let response = service.set_task_status(Request::new(request)).await;
@@ -386,6 +391,7 @@ async fn test_set_task_status_with_namespace_requires_application(#[case] adapte
             application: None,
             namespace: Some(namespace),
         }),
+        max_attempts: None,
     };
 
     let response = service.set_task_status(Request::new(request)).await;
@@ -433,6 +439,7 @@ async fn test_set_task_status_forwards_to_update_channel(#[case] adapter: &str)
                 namespace: None,
                 application: None,
             }),
+            max_attempts: None,
         }))
         .await
         .unwrap();
@@ -476,6 +483,7 @@ async fn test_set_task_status_update_channel_closed_returns_internal() {
             id: "id_0".to_string(),
             status: 5,
             fetch_next_task: None,
+            max_attempts: None,
         }))
         .await;
 

@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use anyhow::Error;
+use rdkafka::Message;
 use rdkafka::message::OwnedMessage;
 
 use crate::config::Config;
@@ -12,27 +13,40 @@ use super::deserialize_raw::{self, RawConfig};
 pub struct DeserializeConfig {
     activation_config: DeserializeActivationConfig,
     raw_config: Option<RawConfig>,
+    /// Retry topic always contains activations, even in raw_mode.
+    retry_topic: Option<String>,
 }
 
 impl DeserializeConfig {
     pub fn from_config(config: &Config) -> Self {
         Self {
             activation_config: DeserializeActivationConfig::from_config(config),
             raw_config: RawConfig::from_config(config),
+            retry_topic: config.kafka_retry_topic.clone(),
         }
     }
 }
 
 /// Create a unified deserializer that handles both normal and raw modes.
 /// In raw mode, raw Kafka bytes are wrapped into a TaskActivation.
 /// In normal mode, Kafka messages are expected to contain encoded TaskActivation protos.
+/// Messages from the retry topic are always deserialized as activations.
 pub fn new(
     config: DeserializeConfig,
 ) -> impl Fn(Arc<OwnedMessage>) -> Result<InflightActivation, Error> {
     let raw_deserializer = config.raw_config.map(deserialize_raw::new);
     let activation_deserializer = deserialize_activation::new(config.activation_config);
+    let retry_topic = config.retry_topic;
 
     move |msg: Arc<OwnedMessage>| {
+        // Messages from the retry topic are always activations
+        if let Some(ref retry_topic) = retry_topic
+            && msg.topic() == retry_topic
+        {
+            return activation_deserializer(msg);
+        }
+
+        // For main topic: use raw deserializer in raw_mode, else activation deserializer
         if let Some(ref raw_deserializer) = raw_deserializer {
             raw_deserializer(msg)
         } else {

@@ -44,6 +44,13 @@ impl RawConfig {
             application
         );
 
+        if let Some(ref retry_topic) = config.kafka_retry_topic {
+            assert!(
+                retry_topic != &config.kafka_topic,
+                "kafka_retry_topic cannot equal kafka_topic when raw_mode is enabled"
+            );
+        }
+
         Some(Self {
             namespace: config
                 .raw_namespace

@@ -86,11 +86,21 @@ async fn main() -> Result<(), Error> {
     if config.create_missing_topics {
         let kafka_client_config = config.kafka_consumer_config();
         create_missing_topics(
-            kafka_client_config,
+            kafka_client_config.clone(),
             &config.kafka_topic,
             config.default_topic_partitions,
         )
         .await?;
+
+        // Create retry topic if configured
+        if let Some(ref retry_topic) = config.kafka_retry_topic {
+            create_missing_topics(
+                kafka_client_config,
+                retry_topic,
+                config.default_topic_partitions,
+            )
+            .await?;
+        }
     }
 
     if config.full_vacuum_on_start {
@@ -158,11 +168,19 @@ async fn main() -> Result<(), Error> {
         let consumer_store = store.clone();
         let consumer_config = config.clone();
         let runtime_config_manager = runtime_config_manager.clone();
+
+        // Build list of topics to consume from
+        let mut topics_to_consume = vec![consumer_config.kafka_topic.clone()];
+        if let Some(ref retry_topic) = consumer_config.kafka_retry_topic {
+            topics_to_consume.push(retry_topic.clone());
+        }
+
         async move {
             // The consumer has an internal thread that listens for cancellations, so it doesn't need
             // an outer select here like the other tasks.
+            let topic_refs: Vec<&str> = topics_to_consume.iter().map(|s| s.as_str()).collect();
             start_consumer(
-                &[&consumer_config.kafka_topic],
+                &topic_refs,
                 &consumer_config.kafka_consumer_config(),
                 consumer_store.clone(),
                 processing_strategy!({

@@ -100,6 +100,7 @@ impl InflightActivationStore for MockStore {
         &self,
         _id: &str,
         _status: InflightActivationStatus,
+        _max_attempts: Option<u32>,
     ) -> anyhow::Result<Option<InflightActivation>> {
         Ok(None)
     }