Skip to content

Commit

Permalink
libassistant: Add limit of restart numbers
Browse files Browse the repository at this point in the history
In this patch, we add a limit on how many times can LibAssistant service restarts in a predefined period of time to prevent consecutive crashes.

If LibAssistant service crashes `kMaxStartServiceRetries` times in total, we will not restart LibAssistant service, unless:
1. Explicitly re-enable the Assistant from the Settings,
2. Reboot the device.
3. It has been `kAutoRecoverTime` since the last crash. After
   `kAutoRecoverTime`, the failure_count will be decreased by one, in
   order to try to auto-recover gradually.

Bug: b:266610912
Test: Add unit tests
Change-Id: I3738bedba8df1d01613f2b8ff041f9b7d5fe8e3c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4220237
Commit-Queue: Tao Wu <wutao@chromium.org>
Reviewed-by: Xiaohui Chen <xiaohuic@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1103992}
  • Loading branch information
Tao Wu authored and Chromium LUCI CQ committed Feb 10, 2023
1 parent a889510 commit 62f7ecd
Show file tree
Hide file tree
Showing 4 changed files with 359 additions and 10 deletions.
85 changes: 77 additions & 8 deletions chromeos/ash/services/assistant/service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ const char* g_s3_server_uri_override = nullptr;
// device.
const char* g_device_id_override = nullptr;

// If LibAssistant service crashes `kMaxStartServiceRetries` times in total, we
// will not restart LibAssistant, unless
// 1. Explicitly re-enable the Assistant from the Settings,
// 2. Reboot the device.
// 3. It has been `kAutoRecoverTime` since the last crash.
constexpr int kMaxStartServiceRetries = 5;

// An interval used to gradually reduce the failure_count so that we could
// restart.
constexpr base::TimeDelta kAutoRecoverTime = base::Hours(24);

constexpr net::BackoffEntry::Policy kRetryStartServiceBackoffPolicy = {
1, // Number of initial errors to ignore.
1000, // Initial delay in ms.
2.0, // Factor by which the waiting time will be multiplied.
0.2, // Fuzzing percentage.
60 * 1000, // Maximum delay in ms.
-1, // Never discard the entry.
true, // Use initial delay.
};

AssistantStatus ToAssistantStatus(AssistantManagerService::State state) {
using State = AssistantManagerService::State;

Expand Down Expand Up @@ -195,7 +216,9 @@ Service::Service(std::unique_ptr<network::PendingSharedURLLoaderFactory>
identity_manager_(identity_manager),
token_refresh_timer_(std::make_unique<base::OneShotTimer>()),
main_task_runner_(base::SequencedTaskRunner::GetCurrentDefault()),
pending_url_loader_factory_(std::move(pending_url_loader_factory)) {
pending_url_loader_factory_(std::move(pending_url_loader_factory)),
start_service_retry_backoff_(&kRetryStartServiceBackoffPolicy),
auto_service_recover_timer_(std::make_unique<base::OneShotTimer>()) {
DCHECK(identity_manager_);
chromeos::PowerManagerClient* power_manager_client =
context_->power_manager_client();
Expand Down Expand Up @@ -310,6 +333,8 @@ void Service::OnAssistantHotwordAlwaysOn(bool hotword_always_on) {
}

void Service::OnAssistantSettingsEnabled(bool enabled) {
// Reset the failure count and backoff delay when the Settings is re-enabled.
start_service_retry_backoff_.Reset();
UpdateAssistantManagerState();
}

Expand Down Expand Up @@ -366,6 +391,10 @@ void Service::UpdateAssistantManagerState() {
return;
}

if (start_service_retry_backoff_.failure_count() > kMaxStartServiceRetries) {
return;
}

if (IsSignedOutMode()) {
// Clear |access_token_| in signed-out mode to keep it synced with what we
// will pass to the |assistant_manager_service_|.
Expand Down Expand Up @@ -406,10 +435,10 @@ void Service::UpdateAssistantManagerState() {
return;
}
// Wait if |assistant_manager_service_| is not at a stable state.
ScheduleUpdateAssistantManagerState();
ScheduleUpdateAssistantManagerState(/*should_backoff=*/false);
break;
case AssistantManagerService::State::STOPPING:
ScheduleUpdateAssistantManagerState();
ScheduleUpdateAssistantManagerState(/*should_backoff=*/false);
break;
case AssistantManagerService::State::RUNNING:
if (assistant_state->settings_enabled().value()) {
Expand All @@ -426,13 +455,16 @@ void Service::UpdateAssistantManagerState() {
}
}

void Service::ScheduleUpdateAssistantManagerState() {
void Service::ScheduleUpdateAssistantManagerState(bool should_backoff) {
update_assistant_manager_callback_.Cancel();
update_assistant_manager_callback_.Reset(base::BindOnce(
&Service::UpdateAssistantManagerState, weak_ptr_factory_.GetWeakPtr()));

base::TimeDelta delay =
should_backoff ? start_service_retry_backoff_.GetTimeUntilRelease()
: kUpdateAssistantManagerDelay;
main_task_runner_->PostDelayedTask(
FROM_HERE, update_assistant_manager_callback_.callback(),
kUpdateAssistantManagerDelay);
FROM_HERE, update_assistant_manager_callback_.callback(), delay);
}

CoreAccountInfo Service::RetrievePrimaryAccountInfo() const {
Expand Down Expand Up @@ -558,8 +590,23 @@ void Service::OnLibassistantServiceStopped() {
void Service::OnLibassistantServiceDisconnected() {
ClearAfterStop();

// Restarts LibassistantService.
ScheduleUpdateAssistantManagerState();
if (auto_service_recover_timer_->IsRunning()) {
auto_service_recover_timer_->Stop();
}
start_service_retry_backoff_.InformOfRequest(/*succeeded=*/false);
if (start_service_retry_backoff_.failure_count() <= kMaxStartServiceRetries) {
LOG(WARNING) << "LibAssistant service disconnected. Re-starting...";

// Restarts LibassistantService.
ScheduleUpdateAssistantManagerState(/*should_backoff=*/true);
} else {
// Start auto recover timer.
auto delay = GetAutoRecoverTime();
auto_service_recover_timer_->Start(FROM_HERE, delay, this,
&Service::DecreaseStartServiceBackoff);
LOG(ERROR)
<< "LibAssistant service keeps disconnected. All retries attempted.";
}
}

void Service::AddAshSessionObserver() {
Expand Down Expand Up @@ -633,4 +680,26 @@ void Service::ClearAfterStop() {
ResetAuthenticationStateObserver();
}

void Service::DecreaseStartServiceBackoff() {
// Reduce the failure_count by one to allow restart.
start_service_retry_backoff_.InformOfRequest(/*succeeded=*/true);

// It is ok to try to reset service if the service is running.
ScheduleUpdateAssistantManagerState(/*should_backoff=*/true);

// Start auto recover timer.
if (start_service_retry_backoff_.failure_count() > 0) {
auto delay = GetAutoRecoverTime();
auto_service_recover_timer_->Start(FROM_HERE, delay, this,
&Service::DecreaseStartServiceBackoff);
}
}

base::TimeDelta Service::GetAutoRecoverTime() {
if (!auto_recover_time_for_testing_.is_zero()) {
return auto_recover_time_for_testing_;
}
return kAutoRecoverTime;
}

} // namespace ash::assistant
20 changes: 19 additions & 1 deletion chromeos/ash/services/assistant/service.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "chromeos/dbus/power/power_manager_client.h"
#include "components/signin/public/identity_manager/account_info.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "net/base/backoff_entry.h"
#include "third_party/abseil-cpp/absl/types/optional.h"

class GoogleServiceAuthError;
Expand Down Expand Up @@ -123,7 +124,7 @@ class COMPONENT_EXPORT(ASSISTANT_SERVICE) Service
void OnStateChanged(AssistantManagerService::State new_state) override;

void UpdateAssistantManagerState();
void ScheduleUpdateAssistantManagerState();
void ScheduleUpdateAssistantManagerState(bool should_backoff);

CoreAccountInfo RetrievePrimaryAccountInfo() const;
void RequestAccessToken();
Expand Down Expand Up @@ -160,6 +161,13 @@ class COMPONENT_EXPORT(ASSISTANT_SERVICE) Service

void ClearAfterStop();

void DecreaseStartServiceBackoff();

base::TimeDelta GetAutoRecoverTime();
void SetAutoRecoverTimeForTesting(base::TimeDelta delay) {
auto_recover_time_for_testing_ = delay;
}

// |ServiceContext| object passed to child classes so they can access some of
// our functionality without depending on us.
// Note: this is used by the other members here, so it must be defined first
Expand Down Expand Up @@ -199,6 +207,16 @@ class COMPONENT_EXPORT(ASSISTANT_SERVICE) Service
std::unique_ptr<network::PendingSharedURLLoaderFactory>
pending_url_loader_factory_;

// If Libassistant service is disconnected, will use this backoff entry to
// restart the service.
net::BackoffEntry start_service_retry_backoff_;

// A timer used to slowly recover from previous crashes by reducing the
// `start_service_retry_backoff_` failure_count by one for every
// `kAutoRecoverTime`.
std::unique_ptr<base::OneShotTimer> auto_service_recover_timer_;
base::TimeDelta auto_recover_time_for_testing_;

base::CancelableOnceClosure update_assistant_manager_callback_;

std::unique_ptr<signin::AccessTokenFetcher> access_token_fetcher_;
Expand Down

0 comments on commit 62f7ecd

Please sign in to comment.