From f40b6825409037bba580243f04bb13404ead4236 Mon Sep 17 00:00:00 2001 From: Matthew Grange Date: Fri, 27 Mar 2026 11:32:53 -0700 Subject: [PATCH] Add exponential backoff to Ax DB retry operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The Axolotl experiment `igfr_h2_toprank_brew_ax_tuning` failed with a MySQL OperationalError (1290) during a database failover while saving analysis cards. The MySQL server was temporarily in read-only mode during master switchover. The existing `retry_on_exception` decorator on DB save/update functions in `with_db_settings_base.py` correctly catches `OperationalError` and makes up to 3 attempts, but it had no wait between attempts (`initial_wait_seconds` was not set). This meant all 3 attempts fired back-to-back and all failed because the failover hadn't completed yet. This diff adds `initial_wait_seconds=5` to all 7 retry-decorated DB operation functions. This enables exponential backoff between attempts: - 1st attempt: immediate - 2nd attempt: after a 5-second wait - 3rd attempt: after a 10-second wait This gives a MySQL failover up to 15 seconds (5 + 10) to complete before the final attempt, which should be sufficient for typical failover scenarios. The `initial_wait_seconds` parameter is already supported by the `retry_on_exception` decorator in `ax.utils.common.executils` — it was simply not being used. 
Functions updated: - `_save_experiment_to_db_if_possible` - `_save_or_update_trials_in_db_if_possible` - `_save_generation_strategy_to_db_if_possible` - `_update_generation_strategy_in_db_if_possible` - `_update_runner_on_experiment_in_db_if_possible` - `_update_experiment_properties_in_db` - `_save_analysis_card_to_db` Differential Revision: D98166115 --- ax/storage/sqa_store/with_db_settings_base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ax/storage/sqa_store/with_db_settings_base.py b/ax/storage/sqa_store/with_db_settings_base.py index 2e80187624b..5ad5acb6c03 100644 --- a/ax/storage/sqa_store/with_db_settings_base.py +++ b/ax/storage/sqa_store/with_db_settings_base.py @@ -498,6 +498,7 @@ def _save_analysis_card_to_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _save_experiment_to_db_if_possible( experiment: Experiment, @@ -521,6 +522,7 @@ def _save_experiment_to_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _save_or_update_trials_in_db_if_possible( experiment: Experiment, @@ -550,6 +552,7 @@ def _save_or_update_trials_in_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _save_generation_strategy_to_db_if_possible( generation_strategy: GenerationStrategy, @@ -573,6 +576,7 @@ def _save_generation_strategy_to_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _update_generation_strategy_in_db_if_possible( generation_strategy: GenerationStrategy, @@ -602,6 +606,7 @@ def _update_generation_strategy_in_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _update_runner_on_experiment_in_db_if_possible( experiment: Experiment, @@ 
-619,6 +624,7 @@ def _update_runner_on_experiment_in_db_if_possible( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _update_experiment_properties_in_db( experiment_with_updated_properties: Experiment, @@ -635,6 +641,7 @@ def _update_experiment_properties_in_db( retries=3, default_return_on_suppression=False, exception_types=RETRY_EXCEPTION_TYPES, + initial_wait_seconds=5, ) def _save_analysis_card_to_db( experiment: Experiment,